[llvm] r364204 - [AArch64] Regenerate 2velem tests. NFCI.

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 24 09:58:19 PDT 2019


Author: rksimon
Date: Mon Jun 24 09:58:19 2019
New Revision: 364204

URL: http://llvm.org/viewvc/llvm-project?rev=364204&view=rev
Log:
[AArch64] Regenerate 2velem tests. NFCI.

Prep work for an upcoming patch

Modified:
    llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll

Modified: llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll?rev=364204&r1=364203&r2=364204&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll Mon Jun 24 09:58:19 2019
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast                 | FileCheck %s --check-prefixes=CHECK,GENERIC
 ; The instruction latencies of Exynos-M1 trigger the transform we see under the Exynos check.
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck %s --check-prefixes=CHECK,EXYNOSM1
@@ -47,7 +48,10 @@ declare <4 x i32> @llvm.aarch64.neon.smu
 
 define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmla_lane_s16:
-; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i16> %shuffle, %b
@@ -57,7 +61,10 @@ entry:
 
 define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlaq_lane_s16:
-; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mla v0.8h, v1.8h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %mul = mul <8 x i16> %shuffle, %b
@@ -67,7 +74,10 @@ entry:
 
 define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmla_lane_s32:
-; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %mul = mul <2 x i32> %shuffle, %b
@@ -77,7 +87,10 @@ entry:
 
 define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlaq_lane_s32:
-; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mla v0.4s, v1.4s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = mul <4 x i32> %shuffle, %b
@@ -87,7 +100,9 @@ entry:
 
 define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmla_laneq_s16:
-; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
@@ -97,7 +112,9 @@ entry:
 
 define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlaq_laneq_s16:
-; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %mul = mul <8 x i16> %shuffle, %b
@@ -107,7 +124,9 @@ entry:
 
 define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmla_laneq_s32:
-; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %b
@@ -117,7 +136,9 @@ entry:
 
 define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlaq_laneq_s32:
-; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.4s, v1.4s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i32> %shuffle, %b
@@ -127,7 +148,10 @@ entry:
 
 define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmls_lane_s16:
-; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i16> %shuffle, %b
@@ -137,7 +161,10 @@ entry:
 
 define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsq_lane_s16:
-; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mls v0.8h, v1.8h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %mul = mul <8 x i16> %shuffle, %b
@@ -147,7 +174,10 @@ entry:
 
 define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmls_lane_s32:
-; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %mul = mul <2 x i32> %shuffle, %b
@@ -157,7 +187,10 @@ entry:
 
 define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsq_lane_s32:
-; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mls v0.4s, v1.4s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = mul <4 x i32> %shuffle, %b
@@ -167,7 +200,9 @@ entry:
 
 define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmls_laneq_s16:
-; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
@@ -177,7 +212,9 @@ entry:
 
 define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsq_laneq_s16:
-; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %mul = mul <8 x i16> %shuffle, %b
@@ -187,7 +224,9 @@ entry:
 
 define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmls_laneq_s32:
-; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %b
@@ -197,7 +236,9 @@ entry:
 
 define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsq_laneq_s32:
-; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.4s, v1.4s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i32> %shuffle, %b
@@ -207,7 +248,10 @@ entry:
 
 define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmul_lane_s16:
-; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i16> %shuffle, %a
@@ -216,7 +260,10 @@ entry:
 
 define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_lane_s16:
-; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %mul = mul <8 x i16> %shuffle, %a
@@ -225,7 +272,10 @@ entry:
 
 define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmul_lane_s32:
-; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %mul = mul <2 x i32> %shuffle, %a
@@ -234,7 +284,10 @@ entry:
 
 define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_lane_s32:
-; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = mul <4 x i32> %shuffle, %a
@@ -243,7 +296,10 @@ entry:
 
 define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmul_lane_u16:
-; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i16> %shuffle, %a
@@ -252,7 +308,10 @@ entry:
 
 define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_lane_u16:
-; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %mul = mul <8 x i16> %shuffle, %a
@@ -261,7 +320,10 @@ entry:
 
 define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmul_lane_u32:
-; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %mul = mul <2 x i32> %shuffle, %a
@@ -270,7 +332,10 @@ entry:
 
 define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_lane_u32:
-; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = mul <4 x i32> %shuffle, %a
@@ -279,7 +344,9 @@ entry:
 
 define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmul_laneq_s16:
-; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %a
@@ -288,7 +355,9 @@ entry:
 
 define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_s16:
-; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %mul = mul <8 x i16> %shuffle, %a
@@ -297,7 +366,9 @@ entry:
 
 define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmul_laneq_s32:
-; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %a
@@ -306,7 +377,9 @@ entry:
 
 define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_s32:
-; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i32> %shuffle, %a
@@ -315,7 +388,9 @@ entry:
 
 define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmul_laneq_u16:
-; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %a
@@ -324,7 +399,9 @@ entry:
 
 define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_u16:
-; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %mul = mul <8 x i16> %shuffle, %a
@@ -333,7 +410,9 @@ entry:
 
 define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmul_laneq_u32:
-; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %a
@@ -342,7 +421,9 @@ entry:
 
 define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_u32:
-; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i32> %shuffle, %a
@@ -350,11 +431,24 @@ entry:
 }
 
 define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK-LABEL: test_vfma_lane_f32:
-; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: test_vfma_lane_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmla v0.2s, v1.2s, v2.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfma_lane_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[1]
+; EXYNOSM1-NEXT:    fmla v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfma_lane_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmla v0.2s, v1.2s, v2.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -364,11 +458,24 @@ entry:
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
 
 define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK-LABEL: test_vfmaq_lane_f32:
-; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: test_vfmaq_lane_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmla v0.4s, v1.4s, v2.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmaq_lane_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.4s, v2.s[1]
+; EXYNOSM1-NEXT:    fmla v0.4s, v1.4s, v2.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmaq_lane_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmla v0.4s, v1.4s, v2.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -378,11 +485,21 @@ entry:
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 
 define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfma_laneq_f32:
-; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; GENERIC-LABEL: test_vfma_laneq_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmla v0.2s, v1.2s, v2.s[3]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfma_laneq_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[3]
+; EXYNOSM1-NEXT:    fmla v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfma_laneq_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmla v0.2s, v1.2s, v2.s[3]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -390,11 +507,21 @@ entry:
 }
 
 define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfmaq_laneq_f32:
-; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; GENERIC-LABEL: test_vfmaq_laneq_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmla v0.4s, v1.4s, v2.s[3]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmaq_laneq_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.4s, v2.s[3]
+; EXYNOSM1-NEXT:    fmla v0.4s, v1.4s, v2.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmaq_laneq_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmla v0.4s, v1.4s, v2.s[3]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -402,11 +529,24 @@ entry:
 }
 
 define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK-LABEL: test_vfms_lane_f32:
-; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: test_vfms_lane_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmls v0.2s, v1.2s, v2.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfms_lane_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[1]
+; EXYNOSM1-NEXT:    fmls v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfms_lane_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmls v0.2s, v1.2s, v2.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -415,11 +555,24 @@ entry:
 }
 
 define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK-LABEL: test_vfmsq_lane_f32:
-; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: test_vfmsq_lane_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmls v0.4s, v1.4s, v2.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmsq_lane_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.4s, v2.s[1]
+; EXYNOSM1-NEXT:    fmls v0.4s, v1.4s, v2.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmsq_lane_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmls v0.4s, v1.4s, v2.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -428,11 +581,21 @@ entry:
 }
 
 define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfms_laneq_f32:
-; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; GENERIC-LABEL: test_vfms_laneq_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmls v0.2s, v1.2s, v2.s[3]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfms_laneq_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[3]
+; EXYNOSM1-NEXT:    fmls v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfms_laneq_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmls v0.2s, v1.2s, v2.s[3]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -441,11 +604,21 @@ entry:
 }
 
 define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfmsq_laneq_f32:
-; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; GENERIC-LABEL: test_vfmsq_laneq_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmls v0.4s, v1.4s, v2.s[3]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmsq_laneq_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.4s, v2.s[3]
+; EXYNOSM1-NEXT:    fmls v0.4s, v1.4s, v2.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmsq_laneq_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmls v0.4s, v1.4s, v2.s[3]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -454,11 +627,24 @@ entry:
 }
 
 define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
-; CHECK-LABEL: test_vfmaq_lane_f64:
-; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; GENERIC-LABEL: test_vfmaq_lane_f64:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmla v0.2d, v1.2d, v2.d[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmaq_lane_f64:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.2d, v2.d[0]
+; EXYNOSM1-NEXT:    fmla v0.2d, v1.2d, v2.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmaq_lane_f64:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmla v0.2d, v1.2d, v2.d[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -468,11 +654,21 @@ entry:
 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
 
 define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK-LABEL: test_vfmaq_laneq_f64:
-; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; GENERIC-LABEL: test_vfmaq_laneq_f64:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmla v0.2d, v1.2d, v2.d[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmaq_laneq_f64:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.2d, v2.d[1]
+; EXYNOSM1-NEXT:    fmla v0.2d, v1.2d, v2.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmaq_laneq_f64:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmla v0.2d, v1.2d, v2.d[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -480,11 +676,24 @@ entry:
 }
 
 define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
-; CHECK-LABEL: test_vfmsq_lane_f64:
-; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; GENERIC-LABEL: test_vfmsq_lane_f64:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmls v0.2d, v1.2d, v2.d[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmsq_lane_f64:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.2d, v2.d[0]
+; EXYNOSM1-NEXT:    fmls v0.2d, v1.2d, v2.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmsq_lane_f64:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmls v0.2d, v1.2d, v2.d[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <1 x double> <double -0.000000e+00>, %v
   %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
@@ -493,11 +702,21 @@ entry:
 }
 
 define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK-LABEL: test_vfmsq_laneq_f64:
-; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; GENERIC-LABEL: test_vfmsq_laneq_f64:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmls v0.2d, v1.2d, v2.d[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmsq_laneq_f64:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.2d, v2.d[1]
+; EXYNOSM1-NEXT:    fmls v0.2d, v1.2d, v2.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmsq_laneq_f64:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmls v0.2d, v1.2d, v2.d[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -506,8 +725,10 @@ entry:
 }
 
 define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfmas_laneq_f32
-; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_vfmas_laneq_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla s0, s1, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %extract = extractelement <4 x float> %v, i32 3
   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
@@ -517,8 +738,10 @@ entry:
 declare float @llvm.fma.f32(float, float, float)
 
 define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
-; CHECK-LABEL: test_vfmsd_lane_f64
-; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: test_vfmsd_lane_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    ret
 entry:
   %extract.rhs = extractelement <1 x double> %v, i32 0
   %extract = fsub double -0.000000e+00, %extract.rhs
@@ -529,8 +752,11 @@ entry:
 declare double @llvm.fma.f64(double, double, double)
 
 define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) {
-; CHECK-LABEL: test_vfmss_lane_f32
-; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vfmss_lane_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmls s0, s1, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %extract.rhs = extractelement <2 x float> %v, i32 1
   %extract = fsub float -0.000000e+00, %extract.rhs
@@ -539,8 +765,10 @@ entry:
 }
 
 define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfmss_laneq_f32
-; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_vfmss_laneq_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls s0, s1, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %extract.rhs = extractelement <4 x float> %v, i32 3
   %extract = fsub float -0.000000e+00, %extract.rhs
@@ -549,8 +777,10 @@ entry:
 }
 
 define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) {
-; CHECK-LABEL: test_vfmsd_laneq_f64
-; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vfmsd_laneq_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
 entry:
   %extract.rhs = extractelement <2 x double> %v, i32 1
   %extract = fsub double -0.000000e+00, %extract.rhs
@@ -559,9 +789,10 @@ entry:
 }
 
 define double @test_vfmsd_lane_f64_0(double %a, double %b, <1 x double> %v) {
-; CHECK-LABEL: test_vfmsd_lane_f64_0
-; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: ret
+; CHECK-LABEL: test_vfmsd_lane_f64_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    ret
 entry:
   %tmp0 = fsub <1 x double> <double -0.000000e+00>, %v
   %tmp1 = extractelement <1 x double> %tmp0, i32 0
@@ -570,8 +801,11 @@ entry:
 }
 
 define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) {
-; CHECK-LABEL: test_vfmss_lane_f32_0
-; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vfmss_lane_f32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmls s0, s1, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %tmp1 = extractelement <2 x float> %tmp0, i32 1
@@ -580,8 +814,10 @@ entry:
 }
 
 define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfmss_laneq_f32_0
-; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_vfmss_laneq_f32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls s0, s1, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %tmp0 = fsub <4 x float><float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %tmp1 = extractelement <4 x float> %tmp0, i32 3
@@ -590,8 +826,10 @@ entry:
 }
 
 define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) {
-; CHECK-LABEL: test_vfmsd_laneq_f64_0
-; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vfmsd_laneq_f64_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
 entry:
   %tmp0 = fsub <2 x double><double -0.000000e+00, double -0.000000e+00>, %v
   %tmp1 = extractelement <2 x double> %tmp0, i32 1
@@ -601,7 +839,10 @@ entry:
 
 define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_lane_s16:
-; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -611,7 +852,10 @@ entry:
 
 define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_lane_s32:
-; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -621,7 +865,9 @@ entry:
 
 define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_s16:
-; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -631,7 +877,9 @@ entry:
 
 define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_s32:
-; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -641,7 +889,10 @@ entry:
 
 define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_s16:
-; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlal2 v0.4s, v1.8h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -652,7 +903,10 @@ entry:
 
 define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_s32:
-; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -663,7 +917,9 @@ entry:
 
 define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_s16:
-; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal2 v0.4s, v1.8h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -674,7 +930,9 @@ entry:
 
 define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_s32:
-; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -685,7 +943,10 @@ entry:
 
 define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_s16:
-; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -695,7 +956,10 @@ entry:
 
 define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_s32:
-; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -705,7 +969,9 @@ entry:
 
 define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_s16:
-; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -715,7 +981,9 @@ entry:
 
 define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_s32:
-; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -725,7 +993,10 @@ entry:
 
 define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_s16:
-; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlsl2 v0.4s, v1.8h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -736,7 +1007,10 @@ entry:
 
 define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_s32:
-; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlsl2 v0.2d, v1.4s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -747,7 +1021,9 @@ entry:
 
 define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_s16:
-; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl2 v0.4s, v1.8h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -758,7 +1034,9 @@ entry:
 
 define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_s32:
-; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl2 v0.2d, v1.4s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -769,7 +1047,10 @@ entry:
 
 define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_lane_u16:
-; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -779,7 +1060,10 @@ entry:
 
 define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_lane_u32:
-; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -789,7 +1073,9 @@ entry:
 
 define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_u16:
-; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -799,7 +1085,9 @@ entry:
 
 define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_u32:
-; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -809,7 +1097,10 @@ entry:
 
 define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_u16:
-; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlal2 v0.4s, v1.8h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -820,7 +1111,10 @@ entry:
 
 define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_u32:
-; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -831,7 +1125,9 @@ entry:
 
 define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_u16:
-; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal2 v0.4s, v1.8h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -842,7 +1138,9 @@ entry:
 
 define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_u32:
-; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -853,7 +1151,10 @@ entry:
 
 define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_u16:
-; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -863,7 +1164,10 @@ entry:
 
 define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_u32:
-; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -873,7 +1177,9 @@ entry:
 
 define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_u16:
-; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -883,7 +1189,9 @@ entry:
 
 define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_u32:
-; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -893,7 +1201,10 @@ entry:
 
 define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_u16:
-; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlsl2 v0.4s, v1.8h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -904,7 +1215,10 @@ entry:
 
 define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_u32:
-; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlsl2 v0.2d, v1.4s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -915,7 +1229,9 @@ entry:
 
 define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_u16:
-; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl2 v0.4s, v1.8h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -926,7 +1242,9 @@ entry:
 
 define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_u32:
-; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl2 v0.2d, v1.4s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -937,7 +1255,10 @@ entry:
 
 define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_lane_s16:
-; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    smull v0.4s, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -946,7 +1267,10 @@ entry:
 
 define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_lane_s32:
-; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    smull v0.2d, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -955,7 +1279,10 @@ entry:
 
 define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_lane_u16:
-; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -964,7 +1291,10 @@ entry:
 
 define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_lane_u32:
-; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -973,7 +1303,10 @@ entry:
 
 define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_s16:
-; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -983,7 +1316,10 @@ entry:
 
 define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_s32:
-; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -993,7 +1329,10 @@ entry:
 
 define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_u16:
-; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umull2 v0.4s, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1003,7 +1342,10 @@ entry:
 
 define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_u32:
-; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umull2 v0.2d, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1013,7 +1355,9 @@ entry:
 
 define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_laneq_s16:
-; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull v0.4s, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1022,7 +1366,9 @@ entry:
 
 define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_laneq_s32:
-; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull v0.2d, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1031,7 +1377,9 @@ entry:
 
 define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_laneq_u16:
-; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1040,7 +1388,9 @@ entry:
 
 define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_laneq_u32:
-; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1049,7 +1399,9 @@ entry:
 
 define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_s16:
-; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1059,7 +1411,9 @@ entry:
 
 define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_s32:
-; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1069,7 +1423,9 @@ entry:
 
 define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_u16:
-; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull2 v0.4s, v0.8h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1079,7 +1435,9 @@ entry:
 
 define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_u32:
-; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull2 v0.2d, v0.4s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1089,7 +1447,10 @@ entry:
 
 define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlal_lane_s16:
-; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlal v0.4s, v1.4h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1099,7 +1460,10 @@ entry:
 
 define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlal_lane_s32:
-; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlal v0.2d, v1.2s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1109,7 +1473,10 @@ entry:
 
 define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlal_high_lane_s16:
-; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1120,7 +1487,10 @@ entry:
 
 define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlal_high_lane_s32:
-; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlal2 v0.2d, v1.4s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1131,7 +1501,10 @@ entry:
 
 define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlsl_lane_s16:
-; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlsl v0.4s, v1.4h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1141,7 +1514,10 @@ entry:
 
 define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlsl_lane_s32:
-; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlsl v0.2d, v1.2s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1151,7 +1527,10 @@ entry:
 
 define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
-; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlsl2 v0.4s, v1.8h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1162,7 +1541,10 @@ entry:
 
 define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
-; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlsl2 v0.2d, v1.4s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1173,7 +1555,10 @@ entry:
 
 define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_lane_s16:
-; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1182,7 +1567,10 @@ entry:
 
 define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_lane_s32:
-; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1191,7 +1579,9 @@ entry:
 
 define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_laneq_s16:
-; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1200,7 +1590,9 @@ entry:
 
 define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_laneq_s32:
-; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1209,7 +1601,10 @@ entry:
 
 define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_high_lane_s16:
-; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1219,7 +1614,10 @@ entry:
 
 define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_high_lane_s32:
-; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmull2 v0.2d, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1229,7 +1627,9 @@ entry:
 
 define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_high_laneq_s16:
-; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1239,7 +1639,9 @@ entry:
 
 define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_high_laneq_s32:
-; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull2 v0.2d, v0.4s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1249,7 +1651,10 @@ entry:
 
 define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmulh_lane_s16:
-; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1258,7 +1663,10 @@ entry:
 
 define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s16:
-; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -1267,7 +1675,10 @@ entry:
 
 define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulh_lane_s32:
-; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1276,7 +1687,10 @@ entry:
 
 define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s32:
-; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -1285,7 +1699,10 @@ entry:
 
 define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s16:
-; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1294,7 +1711,10 @@ entry:
 
 define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s16:
-; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -1303,7 +1723,10 @@ entry:
 
 define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s32:
-; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1312,7 +1735,10 @@ entry:
 
 define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s32:
-; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -1320,11 +1746,24 @@ entry:
 }
 
 define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
-; CHECK-LABEL: test_vmul_lane_f32:
-; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: test_vmul_lane_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmul v0.2s, v0.2s, v1.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmul_lane_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.2s, v1.s[1]
+; EXYNOSM1-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmul_lane_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmul v0.2s, v0.2s, v1.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %mul = fmul <2 x float> %shuffle, %a
@@ -1333,7 +1772,9 @@ entry:
 
 define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
 ; CHECK-LABEL: test_vmul_lane_f64:
-; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmul d0, d0, d1
+; CHECK-NEXT:    ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -1344,11 +1785,24 @@ entry:
 }
 
 define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
-; CHECK-LABEL: test_vmulq_lane_f32:
-; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: test_vmulq_lane_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmul v0.4s, v0.4s, v1.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulq_lane_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.4s, v1.s[1]
+; EXYNOSM1-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulq_lane_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmul v0.4s, v0.4s, v1.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = fmul <4 x float> %shuffle, %a
@@ -1356,11 +1810,24 @@ entry:
 }
 
 define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
-; CHECK-LABEL: test_vmulq_lane_f64:
-; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-; EXYNOSM3: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; GENERIC-LABEL: test_vmulq_lane_f64:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmul v0.2d, v0.2d, v1.d[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulq_lane_f64:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.2d, v1.d[0]
+; EXYNOSM1-NEXT:    fmul v0.2d, v0.2d, v1.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulq_lane_f64:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmul v0.2d, v0.2d, v1.d[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x double> %shuffle, %a
@@ -1368,11 +1835,21 @@ entry:
 }
 
 define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
-; CHECK-LABEL: test_vmul_laneq_f32:
-; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; GENERIC-LABEL: test_vmul_laneq_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmul v0.2s, v0.2s, v1.s[3]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmul_laneq_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.2s, v1.s[3]
+; EXYNOSM1-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmul_laneq_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmul v0.2s, v0.2s, v1.s[3]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %mul = fmul <2 x float> %shuffle, %a
@@ -1381,7 +1858,9 @@ entry:
 
 define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq_f64:
-; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmul d0, d0, v1.d[1]
+; CHECK-NEXT:    ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -1392,11 +1871,21 @@ entry:
 }
 
 define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
-; CHECK-LABEL: test_vmulq_laneq_f32:
-; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; GENERIC-LABEL: test_vmulq_laneq_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmul v0.4s, v0.4s, v1.s[3]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulq_laneq_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.4s, v1.s[3]
+; EXYNOSM1-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulq_laneq_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmul v0.4s, v0.4s, v1.s[3]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = fmul <4 x float> %shuffle, %a
@@ -1404,11 +1893,21 @@ entry:
 }
 
 define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
-; CHECK-LABEL: test_vmulq_laneq_f64:
-; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; GENERIC-LABEL: test_vmulq_laneq_f64:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmul v0.2d, v0.2d, v1.d[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulq_laneq_f64:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.2d, v1.d[1]
+; EXYNOSM1-NEXT:    fmul v0.2d, v0.2d, v1.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulq_laneq_f64:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmul v0.2d, v0.2d, v1.d[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %mul = fmul <2 x double> %shuffle, %a
@@ -1416,11 +1915,24 @@ entry:
 }
 
 define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
-; CHECK-LABEL: test_vmulx_lane_f32:
-; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: test_vmulx_lane_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmulx v0.2s, v0.2s, v1.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulx_lane_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.2s, v1.s[1]
+; EXYNOSM1-NEXT:    fmulx v0.2s, v0.2s, v1.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulx_lane_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmulx v0.2s, v0.2s, v1.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1428,11 +1940,24 @@ entry:
 }
 
 define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
-; CHECK-LABEL: test_vmulxq_lane_f32:
-; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: test_vmulxq_lane_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmulx v0.4s, v0.4s, v1.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulxq_lane_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.4s, v1.s[1]
+; EXYNOSM1-NEXT:    fmulx v0.4s, v0.4s, v1.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulxq_lane_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmulx v0.4s, v0.4s, v1.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1440,11 +1965,24 @@ entry:
 }
 
 define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
-; CHECK-LABEL: test_vmulxq_lane_f64:
-; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; GENERIC-LABEL: test_vmulxq_lane_f64:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmulx v0.2d, v0.2d, v1.d[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulxq_lane_f64:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.2d, v1.d[0]
+; EXYNOSM1-NEXT:    fmulx v0.2d, v0.2d, v1.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulxq_lane_f64:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmulx v0.2d, v0.2d, v1.d[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1452,11 +1990,21 @@ entry:
 }
 
 define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
-; CHECK-LABEL: test_vmulx_laneq_f32:
-; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; GENERIC-LABEL: test_vmulx_laneq_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmulx v0.2s, v0.2s, v1.s[3]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulx_laneq_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.2s, v1.s[3]
+; EXYNOSM1-NEXT:    fmulx v0.2s, v0.2s, v1.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulx_laneq_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmulx v0.2s, v0.2s, v1.s[3]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1464,11 +2012,21 @@ entry:
 }
 
 define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
-; CHECK-LABEL: test_vmulxq_laneq_f32:
-; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; GENERIC-LABEL: test_vmulxq_laneq_f32:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmulx v0.4s, v0.4s, v1.s[3]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulxq_laneq_f32:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.4s, v1.s[3]
+; EXYNOSM1-NEXT:    fmulx v0.4s, v0.4s, v1.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulxq_laneq_f32:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmulx v0.4s, v0.4s, v1.s[3]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1476,11 +2034,21 @@ entry:
 }
 
 define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
-; CHECK-LABEL: test_vmulxq_laneq_f64:
-; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; GENERIC-LABEL: test_vmulxq_laneq_f64:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmulx v0.2d, v0.2d, v1.d[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulxq_laneq_f64:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.2d, v1.d[1]
+; EXYNOSM1-NEXT:    fmulx v0.2d, v0.2d, v1.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulxq_laneq_f64:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmulx v0.2d, v0.2d, v1.d[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1489,7 +2057,10 @@ entry:
 
 define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmla_lane_s16_0:
-; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %b
@@ -1499,7 +2070,10 @@ entry:
 
 define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlaq_lane_s16_0:
-; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mla v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %b
@@ -1509,7 +2083,10 @@ entry:
 
 define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmla_lane_s32_0:
-; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %b
@@ -1519,7 +2096,10 @@ entry:
 
 define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlaq_lane_s32_0:
-; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mla v0.4s, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %b
@@ -1529,7 +2109,9 @@ entry:
 
 define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmla_laneq_s16_0:
-; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %b
@@ -1539,7 +2121,9 @@ entry:
 
 define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlaq_laneq_s16_0:
-; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %b
@@ -1549,7 +2133,9 @@ entry:
 
 define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmla_laneq_s32_0:
-; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %b
@@ -1559,7 +2145,9 @@ entry:
 
 define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlaq_laneq_s32_0:
-; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.4s, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %b
@@ -1569,7 +2157,10 @@ entry:
 
 define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmls_lane_s16_0:
-; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %b
@@ -1579,7 +2170,10 @@ entry:
 
 define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsq_lane_s16_0:
-; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mls v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %b
@@ -1589,7 +2183,10 @@ entry:
 
 define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmls_lane_s32_0:
-; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %b
@@ -1599,7 +2196,10 @@ entry:
 
 define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsq_lane_s32_0:
-; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mls v0.4s, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %b
@@ -1609,7 +2209,9 @@ entry:
 
 define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmls_laneq_s16_0:
-; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %b
@@ -1619,7 +2221,9 @@ entry:
 
 define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsq_laneq_s16_0:
-; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %b
@@ -1629,7 +2233,9 @@ entry:
 
 define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmls_laneq_s32_0:
-; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %b
@@ -1639,7 +2245,9 @@ entry:
 
 define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsq_laneq_s32_0:
-; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.4s, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %b
@@ -1649,7 +2257,10 @@ entry:
 
 define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmul_lane_s16_0:
-; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %a
@@ -1658,7 +2269,10 @@ entry:
 
 define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_lane_s16_0:
-; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %a
@@ -1667,7 +2281,10 @@ entry:
 
 define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmul_lane_s32_0:
-; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %a
@@ -1676,7 +2293,10 @@ entry:
 
 define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_lane_s32_0:
-; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %a
@@ -1685,7 +2305,10 @@ entry:
 
 define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmul_lane_u16_0:
-; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %a
@@ -1694,7 +2317,10 @@ entry:
 
 define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_lane_u16_0:
-; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %a
@@ -1703,7 +2329,10 @@ entry:
 
 define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmul_lane_u32_0:
-; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %a
@@ -1712,7 +2341,10 @@ entry:
 
 define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_lane_u32_0:
-; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %a
@@ -1721,7 +2353,9 @@ entry:
 
 define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmul_laneq_s16_0:
-; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %a
@@ -1730,7 +2364,9 @@ entry:
 
 define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_s16_0:
-; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %a
@@ -1739,7 +2375,9 @@ entry:
 
 define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmul_laneq_s32_0:
-; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %a
@@ -1748,7 +2386,9 @@ entry:
 
 define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_s32_0:
-; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %a
@@ -1757,7 +2397,9 @@ entry:
 
 define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmul_laneq_u16_0:
-; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %a
@@ -1766,7 +2408,9 @@ entry:
 
 define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_u16_0:
-; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %a
@@ -1775,7 +2419,9 @@ entry:
 
 define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmul_laneq_u32_0:
-; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %a
@@ -1784,7 +2430,9 @@ entry:
 
 define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_u32_0:
-; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %a
@@ -1792,11 +2440,24 @@ entry:
 }
 
 define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK-LABEL: test_vfma_lane_f32_0:
-; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vfma_lane_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmla v0.2s, v1.2s, v2.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfma_lane_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[0]
+; EXYNOSM1-NEXT:    fmla v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfma_lane_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmla v0.2s, v1.2s, v2.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1804,11 +2465,24 @@ entry:
 }
 
 define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK-LABEL: test_vfmaq_lane_f32_0:
-; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vfmaq_lane_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmla v0.4s, v1.4s, v2.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmaq_lane_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.4s, v2.s[0]
+; EXYNOSM1-NEXT:    fmla v0.4s, v1.4s, v2.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmaq_lane_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmla v0.4s, v1.4s, v2.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1816,11 +2490,21 @@ entry:
 }
 
 define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfma_laneq_f32_0:
-; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vfma_laneq_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmla v0.2s, v1.2s, v2.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfma_laneq_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[0]
+; EXYNOSM1-NEXT:    fmla v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfma_laneq_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmla v0.2s, v1.2s, v2.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1828,11 +2512,21 @@ entry:
 }
 
 define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfmaq_laneq_f32_0:
-; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vfmaq_laneq_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmla v0.4s, v1.4s, v2.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmaq_laneq_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.4s, v2.s[0]
+; EXYNOSM1-NEXT:    fmla v0.4s, v1.4s, v2.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmaq_laneq_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmla v0.4s, v1.4s, v2.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1840,11 +2534,24 @@ entry:
 }
 
 define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK-LABEL: test_vfms_lane_f32_0:
-; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vfms_lane_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmls v0.2s, v1.2s, v2.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfms_lane_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[0]
+; EXYNOSM1-NEXT:    fmls v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfms_lane_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmls v0.2s, v1.2s, v2.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
@@ -1853,11 +2560,24 @@ entry:
 }
 
 define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK-LABEL: test_vfmsq_lane_f32_0:
-; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vfmsq_lane_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmls v0.4s, v1.4s, v2.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmsq_lane_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.4s, v2.s[0]
+; EXYNOSM1-NEXT:    fmls v0.4s, v1.4s, v2.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmsq_lane_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmls v0.4s, v1.4s, v2.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
@@ -1866,11 +2586,21 @@ entry:
 }
 
 define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfms_laneq_f32_0:
-; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vfms_laneq_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmls v0.2s, v1.2s, v2.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfms_laneq_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[0]
+; EXYNOSM1-NEXT:    fmls v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfms_laneq_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmls v0.2s, v1.2s, v2.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
@@ -1879,11 +2609,21 @@ entry:
 }
 
 define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK-LABEL: test_vfmsq_laneq_f32_0:
-; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vfmsq_laneq_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmls v0.4s, v1.4s, v2.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmsq_laneq_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.4s, v2.s[0]
+; EXYNOSM1-NEXT:    fmls v0.4s, v1.4s, v2.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmsq_laneq_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmls v0.4s, v1.4s, v2.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
@@ -1892,11 +2632,21 @@ entry:
 }
 
 define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK-LABEL: test_vfmaq_laneq_f64_0:
-; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; GENERIC-LABEL: test_vfmaq_laneq_f64_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmla v0.2d, v1.2d, v2.d[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmaq_laneq_f64_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.2d, v2.d[0]
+; EXYNOSM1-NEXT:    fmla v0.2d, v1.2d, v2.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmaq_laneq_f64_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmla v0.2d, v1.2d, v2.d[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -1904,11 +2654,21 @@ entry:
 }
 
 define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK-LABEL: test_vfmsq_laneq_f64_0:
-; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; GENERIC-LABEL: test_vfmsq_laneq_f64_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmls v0.2d, v1.2d, v2.d[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfmsq_laneq_f64_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v2.2d, v2.d[0]
+; EXYNOSM1-NEXT:    fmls v0.2d, v1.2d, v2.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfmsq_laneq_f64_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmls v0.2d, v1.2d, v2.d[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
@@ -1918,7 +2678,10 @@ entry:
 
 define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_lane_s16_0:
-; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1928,7 +2691,10 @@ entry:
 
 define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_lane_s32_0:
-; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1938,7 +2704,9 @@ entry:
 
 define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_s16_0:
-; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1948,7 +2716,9 @@ entry:
 
 define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_s32_0:
-; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1958,7 +2728,10 @@ entry:
 
 define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_s16_0:
-; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlal2 v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -1969,7 +2742,10 @@ entry:
 
 define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_s32_0:
-; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -1980,7 +2756,9 @@ entry:
 
 define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
-; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal2 v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -1991,7 +2769,9 @@ entry:
 
 define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
-; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2002,7 +2782,10 @@ entry:
 
 define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_s16_0:
-; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2012,7 +2795,10 @@ entry:
 
 define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_s32_0:
-; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2022,7 +2808,9 @@ entry:
 
 define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_s16_0:
-; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2032,7 +2820,9 @@ entry:
 
 define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_s32_0:
-; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2042,7 +2832,10 @@ entry:
 
 define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
-; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlsl2 v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2053,7 +2846,10 @@ entry:
 
 define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
-; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    smlsl2 v0.2d, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2064,7 +2860,9 @@ entry:
 
 define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
-; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl2 v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2075,7 +2873,9 @@ entry:
 
 define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
-; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl2 v0.2d, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2086,7 +2886,10 @@ entry:
 
 define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_lane_u16_0:
-; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2096,7 +2899,10 @@ entry:
 
 define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_lane_u32_0:
-; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2106,7 +2912,9 @@ entry:
 
 define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_u16_0:
-; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2116,7 +2924,9 @@ entry:
 
 define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_u32_0:
-; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2126,7 +2936,10 @@ entry:
 
 define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_u16_0:
-; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlal2 v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2137,7 +2950,10 @@ entry:
 
 define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_u32_0:
-; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2148,7 +2964,9 @@ entry:
 
 define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
-; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal2 v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2159,7 +2977,9 @@ entry:
 
 define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
-; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2170,7 +2990,10 @@ entry:
 
 define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_u16_0:
-; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2180,7 +3003,10 @@ entry:
 
 define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_u32_0:
-; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2190,7 +3016,9 @@ entry:
 
 define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_u16_0:
-; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2200,7 +3028,9 @@ entry:
 
 define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_u32_0:
-; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2210,7 +3040,10 @@ entry:
 
 define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
-; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlsl2 v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2221,7 +3054,10 @@ entry:
 
 define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
-; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umlsl2 v0.2d, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2232,7 +3068,9 @@ entry:
 
 define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
-; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl2 v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2243,7 +3081,9 @@ entry:
 
 define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
-; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl2 v0.2d, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2254,7 +3094,10 @@ entry:
 
 define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_lane_s16_0:
-; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    smull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2263,7 +3106,10 @@ entry:
 
 define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_lane_s32_0:
-; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    smull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2272,7 +3118,10 @@ entry:
 
 define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_lane_u16_0:
-; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2281,7 +3130,10 @@ entry:
 
 define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_lane_u32_0:
-; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2290,7 +3142,10 @@ entry:
 
 define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_s16_0:
-; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2300,7 +3155,10 @@ entry:
 
 define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_s32_0:
-; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2310,7 +3168,10 @@ entry:
 
 define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_u16_0:
-; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umull2 v0.4s, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2320,7 +3181,10 @@ entry:
 
 define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_u32_0:
-; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umull2 v0.2d, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2330,7 +3194,9 @@ entry:
 
 define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_laneq_s16_0:
-; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2339,7 +3205,9 @@ entry:
 
 define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_laneq_s32_0:
-; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2348,7 +3216,9 @@ entry:
 
 define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_laneq_u16_0:
-; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2357,7 +3227,9 @@ entry:
 
 define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_laneq_u32_0:
-; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2366,7 +3238,9 @@ entry:
 
 define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_s16_0:
-; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2376,7 +3250,9 @@ entry:
 
 define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_s32_0:
-; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2386,7 +3262,9 @@ entry:
 
 define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_u16_0:
-; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull2 v0.4s, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2396,7 +3274,9 @@ entry:
 
 define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_u32_0:
-; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull2 v0.2d, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2406,7 +3286,10 @@ entry:
 
 define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlal_lane_s16_0:
-; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlal v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2416,7 +3299,10 @@ entry:
 
 define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlal_lane_s32_0:
-; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlal v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2426,7 +3312,10 @@ entry:
 
 define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
-; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2437,7 +3326,10 @@ entry:
 
 define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
-; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlal2 v0.2d, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2448,7 +3340,10 @@ entry:
 
 define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
-; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlsl v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2458,7 +3353,10 @@ entry:
 
 define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
-; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlsl v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2468,7 +3366,10 @@ entry:
 
 define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
-; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlsl2 v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2479,7 +3380,10 @@ entry:
 
 define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:
-; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqdmlsl2 v0.2d, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2490,7 +3394,10 @@ entry:
 
 define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_lane_s16_0:
-; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2499,7 +3406,10 @@ entry:
 
 define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_lane_s32_0:
-; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2508,7 +3418,9 @@ entry:
 
 define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_laneq_s16_0:
-; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2517,7 +3429,9 @@ entry:
 
 define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_laneq_s32_0:
-; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2526,7 +3440,10 @@ entry:
 
 define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_high_lane_s16_0:
-; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2536,7 +3453,10 @@ entry:
 
 define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_high_lane_s32_0:
-; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmull2 v0.2d, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2546,7 +3466,9 @@ entry:
 
 define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:
-; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2556,7 +3478,9 @@ entry:
 
 define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:
-; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull2 v0.2d, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2566,7 +3490,10 @@ entry:
 
 define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmulh_lane_s16_0:
-; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.4h, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2575,7 +3502,10 @@ entry:
 
 define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s16_0:
-; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.8h, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -2584,7 +3514,10 @@ entry:
 
 define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulh_lane_s32_0:
-; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2593,7 +3526,10 @@ entry:
 
 define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s32_0:
-; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.4s, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -2602,7 +3538,10 @@ entry:
 
 define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s16_0:
-; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2611,7 +3550,10 @@ entry:
 
 define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:
-; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -2620,7 +3562,10 @@ entry:
 
 define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s32_0:
-; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2629,7 +3574,10 @@ entry:
 
 define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:
-; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -2637,11 +3585,24 @@ entry:
 }
 
 define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
-; CHECK-LABEL: test_vmul_lane_f32_0:
-; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vmul_lane_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmul_lane_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.2s, v1.s[0]
+; EXYNOSM1-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmul_lane_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x float> %shuffle, %a
@@ -2649,11 +3610,24 @@ entry:
 }
 
 define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
-; CHECK-LABEL: test_vmulq_lane_f32_0:
-; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vmulq_lane_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmul v0.4s, v0.4s, v1.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulq_lane_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.4s, v1.s[0]
+; EXYNOSM1-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulq_lane_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmul v0.4s, v0.4s, v1.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %mul = fmul <4 x float> %shuffle, %a
@@ -2661,11 +3635,21 @@ entry:
 }
 
 define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
-; CHECK-LABEL: test_vmul_laneq_f32_0:
-; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vmul_laneq_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmul_laneq_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.2s, v1.s[0]
+; EXYNOSM1-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmul_laneq_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x float> %shuffle, %a
@@ -2674,7 +3658,9 @@ entry:
 
 define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq_f64_0:
-; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmul d0, d0, v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -2685,11 +3671,21 @@ entry:
 }
 
 define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
-; CHECK-LABEL: test_vmulq_laneq_f32_0:
-; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vmulq_laneq_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmul v0.4s, v0.4s, v1.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulq_laneq_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.4s, v1.s[0]
+; EXYNOSM1-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulq_laneq_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmul v0.4s, v0.4s, v1.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %mul = fmul <4 x float> %shuffle, %a
@@ -2697,11 +3693,21 @@ entry:
 }
 
 define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
-; CHECK-LABEL: test_vmulq_laneq_f64_0:
-; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; GENERIC-LABEL: test_vmulq_laneq_f64_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmul v0.2d, v0.2d, v1.d[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulq_laneq_f64_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.2d, v1.d[0]
+; EXYNOSM1-NEXT:    fmul v0.2d, v0.2d, v1.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulq_laneq_f64_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmul v0.2d, v0.2d, v1.d[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x double> %shuffle, %a
@@ -2709,11 +3715,24 @@ entry:
 }
 
 define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
-; CHECK-LABEL: test_vmulx_lane_f32_0:
-; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vmulx_lane_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmulx v0.2s, v0.2s, v1.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulx_lane_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.2s, v1.s[0]
+; EXYNOSM1-NEXT:    fmulx v0.2s, v0.2s, v1.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulx_lane_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmulx v0.2s, v0.2s, v1.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2721,11 +3740,24 @@ entry:
 }
 
 define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
-; CHECK-LABEL: test_vmulxq_lane_f32_0:
-; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vmulxq_lane_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmulx v0.4s, v0.4s, v1.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulxq_lane_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.4s, v1.s[0]
+; EXYNOSM1-NEXT:    fmulx v0.4s, v0.4s, v1.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulxq_lane_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmulx v0.4s, v0.4s, v1.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2733,11 +3765,24 @@ entry:
 }
 
 define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
-; CHECK-LABEL: test_vmulxq_lane_f64_0:
-; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; GENERIC-LABEL: test_vmulxq_lane_f64_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d1 killed $d1 def $q1
+; GENERIC-NEXT:    fmulx v0.2d, v0.2d, v1.d[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulxq_lane_f64_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM1-NEXT:    dup v1.2d, v1.d[0]
+; EXYNOSM1-NEXT:    fmulx v0.2d, v0.2d, v1.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulxq_lane_f64_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d1 killed $d1 def $q1
+; EXYNOSM3-NEXT:    fmulx v0.2d, v0.2d, v1.d[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -2745,11 +3790,21 @@ entry:
 }
 
 define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
-; CHECK-LABEL: test_vmulx_laneq_f32_0:
-; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
-; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vmulx_laneq_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmulx v0.2s, v0.2s, v1.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulx_laneq_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.2s, v1.s[0]
+; EXYNOSM1-NEXT:    fmulx v0.2s, v0.2s, v1.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulx_laneq_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmulx v0.2s, v0.2s, v1.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2757,11 +3812,21 @@ entry:
 }
 
 define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
-; CHECK-LABEL: test_vmulxq_laneq_f32_0:
-; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; GENERIC-LABEL: test_vmulxq_laneq_f32_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmulx v0.4s, v0.4s, v1.s[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulxq_laneq_f32_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.4s, v1.s[0]
+; EXYNOSM1-NEXT:    fmulx v0.4s, v0.4s, v1.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulxq_laneq_f32_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmulx v0.4s, v0.4s, v1.s[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2769,11 +3834,21 @@ entry:
 }
 
 define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
-; CHECK-LABEL: test_vmulxq_laneq_f64_0:
-; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
-; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; GENERIC-LABEL: test_vmulxq_laneq_f64_0:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmulx v0.2d, v0.2d, v1.d[0]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vmulxq_laneq_f64_0:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v1.2d, v1.d[0]
+; EXYNOSM1-NEXT:    fmulx v0.2d, v0.2d, v1.2d
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vmulxq_laneq_f64_0:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmulx v0.2d, v0.2d, v1.d[0]
+; EXYNOSM3-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -2781,14 +3856,24 @@ entry:
 }
 
 define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
-; CHECK-LABEL: optimize_dup:
-; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; GENERIC-LABEL: optimize_dup:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmla v0.4s, v1.4s, v3.s[3]
+; GENERIC-NEXT:    fmls v0.4s, v2.4s, v3.s[3]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: optimize_dup:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v3.4s, v3.s[3]
+; EXYNOSM1-NEXT:    fmla v0.4s, v1.4s, v3.4s
+; EXYNOSM1-NEXT:    fmls v0.4s, v2.4s, v3.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: optimize_dup:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmla v0.4s, v1.4s, v3.s[3]
+; EXYNOSM3-NEXT:    fmls v0.4s, v2.4s, v3.s[3]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
@@ -2799,15 +3884,25 @@ entry:
 }
 
 define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
-; CHECK-LABEL: no_optimize_dup:
-; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
-; EXYNOSM1: dup  [[W:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[W]].4s
-; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: no_optimize_dup:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    fmla v0.4s, v1.4s, v3.s[3]
+; GENERIC-NEXT:    fmls v0.4s, v2.4s, v3.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: no_optimize_dup:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    dup v4.4s, v3.s[3]
+; EXYNOSM1-NEXT:    fmla v0.4s, v1.4s, v4.4s
+; EXYNOSM1-NEXT:    dup v1.4s, v3.s[1]
+; EXYNOSM1-NEXT:    fmls v0.4s, v2.4s, v1.4s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: no_optimize_dup:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    fmla v0.4s, v1.4s, v3.s[3]
+; EXYNOSM3-NEXT:    fmls v0.4s, v2.4s, v3.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
@@ -2818,8 +3913,24 @@ entry:
 }
 
 define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_a57(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="cortex-a57" {
-; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
-; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmla v0.2s, v1.2s, v2.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[1]
+; EXYNOSM1-NEXT:    fmla v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmla v0.2s, v1.2s, v2.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -2827,9 +3938,25 @@ entry:
 }
 
 define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m1(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m1" {
-; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
-; GENERIC: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
+; GENERIC-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    dup v2.2s, v2.s[1]
+; GENERIC-NEXT:    fmla v0.2s, v1.2s, v2.2s
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[1]
+; EXYNOSM1-NEXT:    fmla v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmla v0.2s, v1.2s, v2.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -2837,8 +3964,24 @@ entry:
 }
 
 define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m3(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m3" {
-; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m3:
-; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; GENERIC-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m3:
+; GENERIC:       // %bb.0: // %entry
+; GENERIC-NEXT:    // kill: def $d2 killed $d2 def $q2
+; GENERIC-NEXT:    fmla v0.2s, v1.2s, v2.s[1]
+; GENERIC-NEXT:    ret
+;
+; EXYNOSM1-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m3:
+; EXYNOSM1:       // %bb.0: // %entry
+; EXYNOSM1-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM1-NEXT:    dup v2.2s, v2.s[1]
+; EXYNOSM1-NEXT:    fmla v0.2s, v1.2s, v2.2s
+; EXYNOSM1-NEXT:    ret
+;
+; EXYNOSM3-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m3:
+; EXYNOSM3:       // %bb.0: // %entry
+; EXYNOSM3-NEXT:    // kill: def $d2 killed $d2 def $q2
+; EXYNOSM3-NEXT:    fmla v0.2s, v1.2s, v2.s[1]
+; EXYNOSM3-NEXT:    ret
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)




More information about the llvm-commits mailing list