[llvm] [AArch64][GlobalISel] Combine G_UNMERGE(G_DUPLANE16) -> G_DUPLANE16 (PR #142731)

Wed Jun 4 00:21:26 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

<details>
<summary>Changes</summary>

Legalizing shuffle-vector splats whose source and result vector sizes differ produces G_UNMERGE(G_DUPLANE16) (and likewise for the other G_DUPLANE element widths). The G_DUPLANE instructions can handle differing vector sizes (a 128-bit source with a 64-bit output, for example), so we can combine away the unmerge.

---

Patch is 60.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142731.diff


5 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64Combine.td (+16-1) 
- (modified) llvm/test/CodeGen/AArch64/arm64-dup.ll (+4-10) 
- (modified) llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll (+228-556) 
- (modified) llvm/test/CodeGen/AArch64/arm64-neon-copy.ll (+12-30) 
- (modified) llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll (+16-40) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index f84e83816bf33..9fe331d5370de 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -172,6 +172,20 @@ def form_duplane : GICombineRule <
   (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
 >;
 
+// Clean up G_UNMERGE(G_DUPLANE16) -> G_DUPLANE16
+class unmerge_duplane<Instruction Op> : GICombineRule <
+  (defs root:$root),
+  (match (Op $a, $src, $c),
+         (G_UNMERGE_VALUES $d1, $d2, $a):$root,
+         [{ return MRI.getType(${d1}.getReg()).getSizeInBits() == 64; }]),
+  (apply (GIReplaceReg $d2, $d1), (Op $d1, $src, $c))
+>;
+def unmerge_duplane8 : unmerge_duplane<G_DUPLANE8>;
+def unmerge_duplane16 : unmerge_duplane<G_DUPLANE16>;
+def unmerge_duplane32 : unmerge_duplane<G_DUPLANE32>;
+def unmerge_duplane64 : unmerge_duplane<G_DUPLANE64>;
+def unmerge_duplanes : GICombineGroup<[unmerge_duplane8, unmerge_duplane16, unmerge_duplane32, unmerge_duplane64]>;
+
 def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn, fullrev,
                                               form_duplane, shuf_to_ins]>;
 
@@ -325,7 +339,8 @@ def AArch64PostLegalizerLowering
                         lower_vector_fcmp, form_truncstore,
                         vector_sext_inreg_to_shift,
                         unmerge_ext_to_unmerge, lower_mulv2s64,
-                        vector_unmerge_lowering, insertelt_nonconst]> {
+                        vector_unmerge_lowering, insertelt_nonconst,
+                        unmerge_duplanes]> {
 }
 
 // Post-legalization combines which are primarily optimizations.
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 4c28ea7592202..12bf09e02aaf9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -422,16 +422,10 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
 ; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
 ; the formation of an indexed-by-7 MLS.
 define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
-; CHECK-SD-LABEL: test_high_splat:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls.4h v0, v1, v2[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_high_splat:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup.8h v2, v2[7]
-; CHECK-GI-NEXT:    mls.4h v0, v2, v1
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_high_splat:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls.4h v0, v1, v2[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index c3ad3b4192cf9..85d8b7c3e2866 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -159,16 +159,10 @@ entry:
 }
 
 define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mla v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    mla v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmla_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
@@ -189,16 +183,10 @@ entry:
 }
 
 define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mla v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    mla v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmla_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %b
@@ -271,16 +259,10 @@ entry:
 }
 
 define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    mls v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmls_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
@@ -301,16 +283,10 @@ entry:
 }
 
 define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmls_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %b
@@ -427,16 +403,10 @@ entry:
 }
 
 define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.4h, v0.4h, v1.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %a
@@ -455,16 +425,10 @@ entry:
 }
 
 define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %a
@@ -483,16 +447,10 @@ entry:
 }
 
 define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.4h, v0.4h, v1.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %a
@@ -511,16 +469,10 @@ entry:
 }
 
 define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %a
@@ -567,16 +519,10 @@ entry:
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 
 define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vfma_laneq_f32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmla v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vfma_laneq_f32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    fmla v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vfma_laneq_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -834,16 +780,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -852,16 +792,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -920,8 +854,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_s16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -940,8 +873,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_s32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -978,16 +910,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -996,16 +922,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1064,8 +984,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1084,8 +1003,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1122,16 +1040,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1140,16 +1052,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1208,8 +1114,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_u16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1228,8 +1133,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_u32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1266,16 +1170,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1284,16 +1182,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1352,8 +1244,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1372,8 +1263,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1512,16 +1402,10 @@ entry:
 }
 
 define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smull v0.4s, v0.4h, v1.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    du...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/142731