[llvm] 1e9d068 - [AArch64] Add extra fma+extract test cases. NFC

Tue Nov 5 02:50:19 PST 2024

Author: David Green
Date: 2024-11-05T10:50:14Z
New Revision: 1e9d0685c5565074871b6af289ea0be4e4333c9b

URL: https://github.com/llvm/llvm-project/commit/1e9d0685c5565074871b6af289ea0be4e4333c9b
DIFF: https://github.com/llvm/llvm-project/commit/1e9d0685c5565074871b6af289ea0be4e4333c9b.diff

LOG: [AArch64] Add extra fma+extract test cases. NFC

Added: 
    

Modified: 
    llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
    llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
index e9fbaf6f046d06..725c44c9788988 100644

--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -8,7 +8,7 @@ declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
 declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
 declare half @llvm.fma.f16(half, half, half) #1
 
-define dso_local <4 x half> @t_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
+define <4 x half> @t_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfma_lane_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -20,7 +20,7 @@ entry:
   ret <4 x half> %fmla3
 }
 
-define dso_local <8 x half> @t_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
+define <8 x half> @t_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmaq_lane_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -32,7 +32,7 @@ entry:
   ret <8 x half> %fmla3
 }
 
-define dso_local <4 x half> @t_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
+define <4 x half> @t_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfma_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmla v0.4h, v1.4h, v2.h[0]
@@ -43,7 +43,7 @@ entry:
   ret <4 x half> %0
 }
 
-define dso_local <8 x half> @t_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
+define <8 x half> @t_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmaq_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmla v0.8h, v1.8h, v2.h[0]
@@ -54,7 +54,7 @@ entry:
   ret <8 x half> %0
 }
 
-define dso_local <4 x half> @t_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
+define <4 x half> @t_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
 ; CHECK-LABEL: t_vfma_n_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
@@ -67,7 +67,7 @@ entry:
   ret <4 x half> %0
 }
 
-define dso_local <8 x half> @t_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
+define <8 x half> @t_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
 ; CHECK-LABEL: t_vfmaq_n_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
@@ -80,7 +80,7 @@ entry:
   ret <8 x half> %0
 }
 
-define dso_local half @t_vfmah_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmah_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmah_lane_f16_0:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -92,7 +92,7 @@ entry:
   ret half %0
 }
 
-define dso_local half @t_vfmah_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmah_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmah_lane_f16_0_swap:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -104,7 +104,7 @@ entry:
   ret half %0
 }
 
-define dso_local half @t_vfmah_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmah_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmah_lane_f16_3:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -116,7 +116,35 @@ entry:
   ret half %0
 }
 
-define dso_local half @t_vfmah_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmah_lane_f16_3_0(half %a, <4 x half> %c) {
+; CHECK-LABEL: t_vfmah_lane_f16_3_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov h2, v1.h[3]
+; CHECK-NEXT:    fmadd h0, h1, h2, h0
+; CHECK-NEXT:    ret
+entry:
+  %b = extractelement <4 x half> %c, i32 0
+  %extract = extractelement <4 x half> %c, i32 3
+  %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
+  ret half %0
+}
+
+define half @t_vfmah_lane_f16_0_0(half %a, <4 x half> %b, <4 x half> %c) {
+; CHECK-LABEL: t_vfmah_lane_f16_0_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmadd h0, h1, h2, h0
+; CHECK-NEXT:    ret
+entry:
+  %b0 = extractelement <4 x half> %b, i32 0
+  %c0 = extractelement <4 x half> %c, i32 0
+  %0 = tail call half @llvm.fma.f16(half %b0, half %c0, half %a)
+  ret half %0
+}
+
+define half @t_vfmah_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmah_laneq_f16_0:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmadd h0, h1, h2, h0
@@ -127,7 +155,7 @@ entry:
   ret half %0
 }
 
-define dso_local half @t_vfmah_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmah_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmah_laneq_f16_0_swap:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmadd h0, h2, h1, h0
@@ -138,7 +166,7 @@ entry:
   ret half %0
 }
 
-define dso_local half @t_vfmah_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmah_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmah_laneq_f16_7:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmla h0, h1, v2.h[7]
@@ -149,7 +177,7 @@ entry:
   ret half %0
 }
 
-define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
+define <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfms_lane_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -162,7 +190,7 @@ entry:
   ret <4 x half> %fmla3
 }
 
-define dso_local <8 x half> @t_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
+define <8 x half> @t_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsq_lane_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -175,7 +203,7 @@ entry:
   ret <8 x half> %fmla3
 }
 
-define dso_local <4 x half> @t_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
+define <4 x half> @t_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfms_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmls v0.4h, v1.4h, v2.h[0]
@@ -187,7 +215,7 @@ entry:
   ret <4 x half> %0
 }
 
-define dso_local <8 x half> @t_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
+define <8 x half> @t_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsq_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmls v0.8h, v1.8h, v2.h[0]
@@ -199,7 +227,7 @@ entry:
   ret <8 x half> %0
 }
 
-define dso_local <4 x half> @t_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
+define <4 x half> @t_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
 ; CHECK-LABEL: t_vfms_n_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
@@ -213,7 +241,7 @@ entry:
   ret <4 x half> %0
 }
 
-define dso_local <8 x half> @t_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
+define <8 x half> @t_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
 ; CHECK-LABEL: t_vfmsq_n_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
@@ -227,7 +255,7 @@ entry:
   ret <8 x half> %0
 }
 
-define dso_local half @t_vfmsh_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmsh_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsh_lane_f16_0:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -240,7 +268,7 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vfmsh_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmsh_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsh_lane_f16_0_swap:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -253,7 +281,7 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vfmsh_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmsh_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsh_lane_f16_3:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -266,7 +294,7 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vfmsh_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmsh_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsh_laneq_f16_0:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmsub h0, h2, h1, h0
@@ -278,7 +306,22 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vfmsh_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmsh_lane_f16_0_3(half %a, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsh_lane_f16_0_3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov h2, v1.h[3]
+; CHECK-NEXT:    fmsub h0, h2, h1, h0
+; CHECK-NEXT:    ret
+entry:
+  %b = extractelement <4 x half> %c, i32 0
+  %0 = fsub half 0xH8000, %b
+  %extract = extractelement <4 x half> %c, i32 3
+  %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
+  ret half %1
+}
+
+define half @t_vfmsh_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsh_laneq_f16_0_swap:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmsub h0, h2, h1, h0
@@ -290,7 +333,7 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vfmsh_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmsh_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsh_laneq_f16_7:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmls h0, h1, v2.h[7]
@@ -302,7 +345,7 @@ entry:
   ret half %1
 }
 
-define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
+define <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmul_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmul v0.4h, v0.4h, v1.h[0]
@@ -313,7 +356,7 @@ entry:
   ret <4 x half> %mul
 }
 
-define dso_local <8 x half> @t_vmulq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
+define <8 x half> @t_vmulq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulq_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmul v0.8h, v0.8h, v1.h[0]
@@ -324,7 +367,7 @@ entry:
   ret <8 x half> %mul
 }
 
-define dso_local half @t_vmulh_lane0_f16(half %a, <4 x half> %c, i32 %lane) {
+define half @t_vmulh_lane0_f16(half %a, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vmulh_lane0_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
@@ -336,7 +379,7 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vmulh_lane3_f16(half %a, <4 x half> %c, i32 %lane) {
+define half @t_vmulh_lane3_f16(half %a, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vmulh_lane3_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
@@ -348,7 +391,7 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vmulh_laneq0_f16(half %a, <8 x half> %c, i32 %lane) {
+define half @t_vmulh_laneq0_f16(half %a, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vmulh_laneq0_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmul h0, h0, h1
@@ -359,7 +402,7 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vmulh_laneq7_f16(half %a, <8 x half> %c, i32 %lane) {
+define half @t_vmulh_laneq7_f16(half %a, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vmulh_laneq7_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmul h0, h0, v1.h[7]
@@ -370,7 +413,7 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vmulx_f16(half %a, half %b) {
+define half @t_vmulx_f16(half %a, half %b) {
 ; CHECK-LABEL: t_vmulx_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx h0, h0, h1
@@ -380,7 +423,7 @@ entry:
   ret half %fmulx.i
 }
 
-define dso_local half @t_vmulxh_lane0_f16(half %a, <4 x half> %b) {
+define half @t_vmulxh_lane0_f16(half %a, <4 x half> %b) {
 ; CHECK-LABEL: t_vmulxh_lane0_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
@@ -392,7 +435,7 @@ entry:
   ret half %fmulx.i
 }
 
-define dso_local half @t_vmulxh_lane3_f16(half %a, <4 x half> %b, i32 %lane) {
+define half @t_vmulxh_lane3_f16(half %a, <4 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulxh_lane3_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
@@ -404,7 +447,7 @@ entry:
   ret half %fmulx.i
 }
 
-define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
+define <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulx_lane_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
@@ -416,7 +459,7 @@ entry:
   ret <4 x half> %vmulx2.i
 }
 
-define dso_local <8 x half> @t_vmulxq_lane_f16(<8 x half> %a, <4 x half> %b, i32 %lane) {
+define <8 x half> @t_vmulxq_lane_f16(<8 x half> %a, <4 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulxq_lane_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
@@ -428,7 +471,7 @@ entry:
   ret <8 x half> %vmulx2.i
 }
 
-define dso_local <4 x half> @t_vmulx_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
+define <4 x half> @t_vmulx_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulx_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx v0.4h, v0.4h, v1.h[0]
@@ -439,7 +482,7 @@ entry:
   ret <4 x half> %vmulx2.i
 }
 
-define dso_local <8 x half> @t_vmulxq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
+define <8 x half> @t_vmulxq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulxq_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx v0.8h, v0.8h, v1.h[0]
@@ -450,7 +493,7 @@ entry:
   ret <8 x half> %vmulx2.i
 }
 
-define dso_local half @t_vmulxh_laneq0_f16(half %a, <8 x half> %b) {
+define half @t_vmulxh_laneq0_f16(half %a, <8 x half> %b) {
 ; CHECK-LABEL: t_vmulxh_laneq0_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx h0, h0, h1
@@ -461,7 +504,7 @@ entry:
   ret half %fmulx.i
 }
 
-define dso_local half @t_vmulxh_laneq7_f16(half %a, <8 x half> %b, i32 %lane) {
+define half @t_vmulxh_laneq7_f16(half %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulxh_laneq7_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx h0, h0, v1.h[7]
@@ -472,7 +515,7 @@ entry:
   ret half %fmulx.i
 }
 
-define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
+define <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
 ; CHECK-LABEL: t_vmulx_n_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $q1
@@ -486,7 +529,7 @@ entry:
   ret <4 x half> %vmulx2.i
 }
 
-define dso_local <8 x half> @t_vmulxq_n_f16(<8 x half> %a, half %c) {
+define <8 x half> @t_vmulxq_n_f16(<8 x half> %a, half %c) {
 ; CHECK-LABEL: t_vmulxq_n_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $q1
@@ -500,7 +543,7 @@ entry:
   ret <8 x half> %vmulx2.i
 }
 
-define dso_local half @t_vfmah_lane3_f16(half %a, half %b, <4 x half> %c) {
+define half @t_vfmah_lane3_f16(half %a, half %b, <4 x half> %c) {
 ; CHECK-LABEL: t_vfmah_lane3_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -512,7 +555,7 @@ entry:
   ret half %0
 }
 
-define dso_local half @t_vfmah_laneq7_f16(half %a, half %b, <8 x half> %c) {
+define half @t_vfmah_laneq7_f16(half %a, half %b, <8 x half> %c) {
 ; CHECK-LABEL: t_vfmah_laneq7_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmla h0, h1, v2.h[7]
@@ -523,7 +566,7 @@ entry:
   ret half %0
 }
 
-define dso_local half @t_vfmsh_lane3_f16(half %a, half %b, <4 x half> %c) {
+define half @t_vfmsh_lane3_f16(half %a, half %b, <4 x half> %c) {
 ; CHECK-LABEL: t_vfmsh_lane3_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -536,7 +579,7 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vfmsh_laneq7_f16(half %a, half %b, <8 x half> %c) {
+define half @t_vfmsh_laneq7_f16(half %a, half %b, <8 x half> %c) {
 ; CHECK-LABEL: t_vfmsh_laneq7_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmls h0, h1, v2.h[7]
@@ -548,7 +591,7 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_fadd_vfmah_f16(half %a, half %b, <4 x half> %c, <4 x half> %d) {
+define half @t_fadd_vfmah_f16(half %a, half %b, <4 x half> %c, <4 x half> %d) {
 ; CHECK-LABEL: t_fadd_vfmah_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fadd v2.4h, v2.4h, v3.4h

diff  --git a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
index ed88293fcf7e34..b2ea6ff200be1d 100644
--- a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 attributes #0 = { strictfp }
@@ -8,112 +9,252 @@ declare float @llvm.experimental.constrained.fma.f32(float, float, float, metada
 declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
 
 define float @test_fmla_ss4S_0(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmla_ss4S_0
-  ; CHECK: fmadd s0, s1, s2, s0
+; CHECK-LABEL: test_fmla_ss4S_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 0
   %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
   ret float %tmp2
 }
 
 define float @test_fmla_ss4S_0_swap(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmla_ss4S_0_swap
-  ; CHECK: fmadd s0, s2, s1, s0
+; CHECK-LABEL: test_fmla_ss4S_0_swap:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 0
   %tmp2 = call float @llvm.fma.f32(float %tmp1, float %b, float %a)
   ret float %tmp2
 }
 
 define float @test_fmla_ss4S_3(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmla_ss4S_3
-  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmla_ss4S_3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla s0, s1, v2.s[3]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
   ret float %tmp2
 }
 
 define float @test_fmla_ss4S_3_swap(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmla_ss4S_3_swap
-  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmla_ss4S_3_swap:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla s0, s0, v2.s[3]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a)
   ret float %tmp2
 }
 
 define float @test_fmla_ss2S_0(float %a, float %b, <2 x float> %v) {
-  ; CHECK-LABEL: test_fmla_ss2S_0
-  ; CHECK: fmadd s0, s1, s2, s0
+; CHECK-LABEL: test_fmla_ss2S_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x float> %v, i32 0
   %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
   ret float %tmp2
 }
 
 define float @test_fmla_ss2S_0_swap(float %a, float %b, <2 x float> %v) {
-  ; CHECK-LABEL: test_fmla_ss2S_0_swap
-  ; CHECK: fmadd s0, s2, s1, s0
+; CHECK-LABEL: test_fmla_ss2S_0_swap:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x float> %v, i32 0
   %tmp2 = call float @llvm.fma.f32(float %tmp1, float %b, float %a)
   ret float %tmp2
 }
 
 define float @test_fmla_ss2S_1(float %a, float %b, <2 x float> %v) {
-  ; CHECK-LABEL: test_fmla_ss2S_1
-  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_fmla_ss2S_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmla s0, s1, v2.s[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x float> %v, i32 1
   %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
   ret float %tmp2
 }
 
+define float @test_fmla_ss4S_3_ext0(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmla_ss4S_3_ext0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s2, v1.s[3]
+; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <4 x float> %v, i32 0
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.fma.f32(float %tmp0, float %tmp1, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss4S_3_ext0_swp(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmla_ss4S_3_ext0_swp:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s2, v1.s[3]
+; CHECK-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <4 x float> %v, i32 0
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.fma.f32(float %tmp1, float %tmp0, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss4S_0_ext0(float %a, <4 x float> %v, <4 x float> %w) {
+; CHECK-LABEL: test_fmla_ss4S_0_ext0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <4 x float> %v, i32 0
+  %tmp1 = extractelement <4 x float> %w, i32 0
+  %tmp2 = call float @llvm.fma.f32(float %tmp0, float %tmp1, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss2S_3_ext0(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmla_ss2S_3_ext0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov s2, v1.s[1]
+; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <2 x float> %v, i32 0
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = call float @llvm.fma.f32(float %tmp0, float %tmp1, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss2S_3_ext0_swp(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmla_ss2S_3_ext0_swp:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov s2, v1.s[1]
+; CHECK-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <2 x float> %v, i32 0
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = call float @llvm.fma.f32(float %tmp1, float %tmp0, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss2S_0_ext0(float %a, <2 x float> %v, <2 x float> %w) {
+; CHECK-LABEL: test_fmla_ss2S_0_ext0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <2 x float> %v, i32 0
+  %tmp1 = extractelement <2 x float> %w, i32 0
+  %tmp2 = call float @llvm.fma.f32(float %tmp0, float %tmp1, float %a)
+  ret float %tmp2
+}
+
 define double @test_fmla_ddD_0(double %a, double %b, <1 x double> %v) {
-  ; CHECK-LABEL: test_fmla_ddD_0
-  ; CHECK: fmadd d0, d1, d2, d0
+; CHECK-LABEL: test_fmla_ddD_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd d0, d1, d2, d0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <1 x double> %v, i32 0
   %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
   ret double %tmp2
 }
 
 define double @test_fmla_ddD_0_swap(double %a, double %b, <1 x double> %v) {
-  ; CHECK-LABEL: test_fmla_ddD_0_swap
-  ; CHECK: fmadd d0, d2, d1, d0
+; CHECK-LABEL: test_fmla_ddD_0_swap:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd d0, d2, d1, d0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <1 x double> %v, i32 0
   %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
   ret double %tmp2
 }
 
 define double @test_fmla_dd2D_0(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmla_dd2D_0
-  ; CHECK: fmadd d0, d1, d2, d0
+; CHECK-LABEL: test_fmla_dd2D_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd d0, d1, d2, d0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 0
   %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
   ret double %tmp2
 }
 
 define double @test_fmla_dd2D_0_swap(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmla_dd2D_0_swap
-  ; CHECK: fmadd d0, d2, d1, d0
+; CHECK-LABEL: test_fmla_dd2D_0_swap:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd d0, d2, d1, d0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 0
   %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
   ret double %tmp2
 }
 
 define double @test_fmla_dd2D_1(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmla_dd2D_1
-  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmla_dd2D_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
   ret double %tmp2
 }
 
 define double @test_fmla_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmla_dd2D_1_swap
-  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmla_dd2D_1_swap:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
   ret double %tmp2
 }
 
+define double @test_fmla_ss2D_1_ext0(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmla_ss2D_1_ext0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov d2, v1.d[1]
+; CHECK-NEXT:    fmadd d0, d1, d2, d0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <2 x double> %v, i32 0
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.fma.f64(double %tmp0, double %tmp1, double %a)
+  ret double %tmp2
+}
+
+define double @test_fmla_ss2D_1_ext0_swp(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmla_ss2D_1_ext0_swp:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov d2, v1.d[1]
+; CHECK-NEXT:    fmadd d0, d2, d1, d0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <2 x double> %v, i32 0
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.fma.f64(double %tmp1, double %tmp0, double %a)
+  ret double %tmp2
+}
+
+define double @test_fmla_ss2D_0_ext0(double %a, <2 x double> %v, <2 x double> %w) {
+; CHECK-LABEL: test_fmla_ss2D_0_ext0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd d0, d1, d2, d0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <2 x double> %v, i32 0
+  %tmp1 = extractelement <2 x double> %w, i32 0
+  %tmp2 = call double @llvm.fma.f64(double %tmp0, double %tmp1, double %a)
+  ret double %tmp2
+}
+
 define float @test_fmls_ss4S_0(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmls_ss4S_0
-  ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss4S_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg float %b
   %extract = extractelement <4 x float> %v, i64 0
@@ -122,8 +263,10 @@ entry:
 }
 
 define float @test_fmls_ss4S_0_swap(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmls_ss4S_0_swap
-  ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss4S_0_swap:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg float %b
   %extract = extractelement <4 x float> %v, i64 0
@@ -132,8 +275,11 @@ entry:
 }
 
 define float @test_fmls_ss4S_3(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmls_ss4S_3
-  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmls_ss4S_3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s1, v2.s[3]
+; CHECK-NEXT:    fmls s0, s1, v2.s[3]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = fsub float -0.0, %tmp1
   %tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a)
@@ -141,18 +287,23 @@ define float @test_fmls_ss4S_3(float %a, float %b, <4 x float> %v) {
 }
 
 define float @test_fmls_ss4S_3_swap(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmls_ss4S_3_swap
-  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmls_ss4S_3_swap:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s1, v2.s[3]
+; CHECK-NEXT:    fmls s0, s1, v2.s[3]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = fsub float -0.0, %tmp1
   %tmp3 = call float @llvm.fma.f32(float %tmp1, float %tmp2, float %a)
   ret float %tmp3
 }
 
-
 define float @test_fmls_ss2S_0(float %a, float %b, <2 x float> %v) {
-  ; CHECK-LABEL: test_fmls_ss2S_0
-  ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss2S_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg float %b
   %extract = extractelement <2 x float> %v, i64 0
@@ -161,8 +312,11 @@ entry:
 }
 
 define float @test_fmls_ss2S_0_swap(float %a, float %b, <2 x float> %v) {
-  ; CHECK-LABEL: test_fmls_ss2S_0_swap
-  ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss2S_0_swap:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg float %b
   %extract = extractelement <2 x float> %v, i64 0
@@ -171,17 +325,48 @@ entry:
 }
 
 define float @test_fmls_ss2S_1(float %a, float %b, <2 x float> %v) {
-  ; CHECK-LABEL: test_fmls_ss2S_1
-  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_fmls_ss2S_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov s1, v2.s[1]
+; CHECK-NEXT:    fmls s0, s1, v2.s[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x float> %v, i32 1
   %tmp2 = fsub float -0.0, %tmp1
   %tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a)
   ret float %tmp3
 }
 
+define float @test_fmls_ss4S_3_ext0(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmls_ss4S_3_ext0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s2, v1.s[3]
+; CHECK-NEXT:    fmsub s0, s1, s2, s0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <4 x float> %v, i32 0
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fsub float -0.0, %tmp1
+  %tmp3 = call float @llvm.fma.f32(float %tmp0, float %tmp2, float %a)
+  ret float %tmp3
+}
+
+define float @test_fmls_ss4S_0_ext0(float %a, <4 x float> %v, <4 x float> %w) {
+; CHECK-LABEL: test_fmls_ss4S_0_ext0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmsub s0, s1, s2, s0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <4 x float> %v, i32 0
+  %tmp1 = extractelement <4 x float> %w, i32 0
+  %tmp2 = fsub float -0.0, %tmp1
+  %tmp3 = call float @llvm.fma.f32(float %tmp0, float %tmp2, float %a)
+  ret float %tmp3
+}
+
 define double @test_fmls_ddD_0(double %a, double %b, <1 x double> %v) {
-  ; CHECK-LABEL: test_fmls_ddD_0
-  ; CHECK: fmsub d0, d1, d2, d0
+; CHECK-LABEL: test_fmls_ddD_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg double %b
   %extract = extractelement <1 x double> %v, i64 0
@@ -190,8 +375,10 @@ entry:
 }
 
 define double @test_fmls_ddD_0_swap(double %a, double %b, <1 x double> %v) {
-  ; CHECK-LABEL: test_fmls_ddD_0_swap
-  ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_ddD_0_swap:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d2, d1, d0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg double %b
   %extract = extractelement <1 x double> %v, i64 0
@@ -200,8 +387,10 @@ entry:
 }
 
 define double @test_fmls_dd2D_0(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmls_dd2D_0
-  ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_dd2D_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d2, d1, d0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg double %b
   %extract = extractelement <2 x double> %v, i64 0
@@ -210,8 +399,10 @@ entry:
 }
 
 define double @test_fmls_dd2D_0_swap(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmls_dd2D_0_swap
-  ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_dd2D_0_swap:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d2, d1, d0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg double %b
   %extract = extractelement <2 x double> %v, i64 0
@@ -220,8 +411,11 @@ entry:
 }
 
 define double @test_fmls_dd2D_1(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmls_dd2D_1
-  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmls_dd2D_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov d1, v2.d[1]
+; CHECK-NEXT:    fmls d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = fsub double -0.0, %tmp1
   %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
@@ -229,121 +423,180 @@ define double @test_fmls_dd2D_1(double %a, double %b, <2 x double> %v) {
 }
 
 define double @test_fmls_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmls_dd2D_1_swap
-  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmls_dd2D_1_swap:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov d1, v2.d[1]
+; CHECK-NEXT:    fmls d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = fsub double -0.0, %tmp1
   %tmp3 = call double @llvm.fma.f64(double %tmp1, double %tmp2, double %a)
   ret double %tmp3
 }
 
+define double @test_fmls_dd2D_1_ext0(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmls_dd2D_1_ext0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov d2, v1.d[1]
+; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <2 x double> %v, i32 0
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fsub double -0.0, %tmp1
+  %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp0, double %a)
+  ret double %tmp3
+}
+
+define double @test_fmls_dd2D_0_ext0(double %a, <2 x double> %v, <2 x double> %w) {
+; CHECK-LABEL: test_fmls_dd2D_0_ext0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmsub d0, d1, d2, d0
+; CHECK-NEXT:    ret
+  %tmp0 = extractelement <2 x double> %v, i32 0
+  %tmp1 = extractelement <2 x double> %w, i32 0
+  %tmp2 = fsub double -0.0, %tmp1
+  %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp0, double %a)
+  ret double %tmp3
+}
+
 define float @test_fmla_ss4S_0_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ss4S_0_strict
-  ; CHECK: fmadd s0, s1, s2, s0
+; CHECK-LABEL: test_fmla_ss4S_0_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 0
   %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %tmp2
 }
 
 define float @test_fmla_ss4S_0_swap_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ss4S_0_swap_strict
-  ; CHECK: fmadd s0, s2, s1, s0
+; CHECK-LABEL: test_fmla_ss4S_0_swap_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 0
   %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %b, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %tmp2
 }
 
 define float @test_fmla_ss4S_3_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ss4S_3_strict
-  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmla_ss4S_3_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla s0, s1, v2.s[3]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %tmp2
 }
 
 define float @test_fmla_ss4S_3_swap_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ss4S_3_swap_strict
-  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmla_ss4S_3_swap_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla s0, s0, v2.s[3]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %a, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %tmp2
 }
 
 define float @test_fmla_ss2S_0_strict(float %a, float %b, <2 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ss2S_0_strict
-  ; CHECK: fmadd s0, s1, s2, s0
+; CHECK-LABEL: test_fmla_ss2S_0_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x float> %v, i32 0
   %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %tmp2
 }
 
 define float @test_fmla_ss2S_0_swap_strict(float %a, float %b, <2 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ss2S_0_swap_strict
-  ; CHECK: fmadd s0, s2, s1, s0
+; CHECK-LABEL: test_fmla_ss2S_0_swap_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x float> %v, i32 0
   %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %b, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %tmp2
 }
 
 define float @test_fmla_ss2S_1_strict(float %a, float %b, <2 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ss2S_1_strict
-  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_fmla_ss2S_1_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmla s0, s1, v2.s[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x float> %v, i32 1
   %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %tmp2
 }
 
 define double @test_fmla_ddD_0_strict(double %a, double %b, <1 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ddD_0_strict
-  ; CHECK: fmadd d0, d1, d2, d0
+; CHECK-LABEL: test_fmla_ddD_0_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd d0, d1, d2, d0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <1 x double> %v, i32 0
   %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %tmp2
 }
 
 define double @test_fmla_ddD_0_swap_strict(double %a, double %b, <1 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ddD_0_swap_strict
-  ; CHECK: fmadd d0, d2, d1, d0
+; CHECK-LABEL: test_fmla_ddD_0_swap_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd d0, d2, d1, d0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <1 x double> %v, i32 0
   %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %tmp2
 }
 
 define double @test_fmla_dd2D_0_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmla_dd2D_0_strict
-  ; CHECK: fmadd d0, d1, d2, d0
+; CHECK-LABEL: test_fmla_dd2D_0_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd d0, d1, d2, d0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 0
   %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %tmp2
 }
 
 define double @test_fmla_dd2D_0_swap_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmla_dd2D_0_swap_strict
-  ; CHECK: fmadd d0, d2, d1, d0
+; CHECK-LABEL: test_fmla_dd2D_0_swap_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmadd d0, d2, d1, d0
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 0
   %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %tmp2
 }
 
 define double @test_fmla_dd2D_1_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmla_dd2D_1_strict
-  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmla_dd2D_1_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %tmp2
 }
 
 define double @test_fmla_dd2D_1_swap_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmla_dd2D_1_swap_strict
-  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmla_dd2D_1_swap_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmla d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %tmp2
 }
 
 define float @test_fmls_ss4S_0_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ss4S_0_strict
-  ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss4S_0_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg float %b
   %extract = extractelement <4 x float> %v, i64 0
@@ -352,8 +605,10 @@ entry:
 }
 
 define float @test_fmls_ss4S_0_swap_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ss4S_0_swap_strict
-  ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss4S_0_swap_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg float %b
   %extract = extractelement <4 x float> %v, i64 0
@@ -362,8 +617,11 @@ entry:
 }
 
 define float @test_fmls_ss4S_3_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ss4S_3_strict
-  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmls_ss4S_3_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s1, v2.s[3]
+; CHECK-NEXT:    fmls s0, s1, v2.s[3]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = fneg float %tmp1
   %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp2, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -371,8 +629,11 @@ define float @test_fmls_ss4S_3_strict(float %a, float %b, <4 x float> %v) #0 {
 }
 
 define float @test_fmls_ss4S_3_swap_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ss4S_3_swap_strict
-  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmls_ss4S_3_swap_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s1, v2.s[3]
+; CHECK-NEXT:    fmls s0, s1, v2.s[3]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = fneg float %tmp1
   %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %tmp2, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -380,8 +641,11 @@ define float @test_fmls_ss4S_3_swap_strict(float %a, float %b, <4 x float> %v) #
 }
 
 define float @test_fmls_ss2S_0_strict(float %a, float %b, <2 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ss2S_0_strict
-  ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss2S_0_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg float %b
   %extract = extractelement <2 x float> %v, i64 0
@@ -390,8 +654,11 @@ entry:
 }
 
 define float @test_fmls_ss2S_0_swap_strict(float %a, float %b, <2 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ss2S_0_swap_strict
-  ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss2S_0_swap_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg float %b
   %extract = extractelement <2 x float> %v, i64 0
@@ -400,8 +667,12 @@ entry:
 }
 
 define float @test_fmls_ss2S_1_strict(float %a, float %b, <2 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ss2S_1_strict
-  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_fmls_ss2S_1_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov s1, v2.s[1]
+; CHECK-NEXT:    fmls s0, s1, v2.s[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x float> %v, i32 1
   %tmp2 = fneg float %tmp1
   %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp2, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -409,8 +680,10 @@ define float @test_fmls_ss2S_1_strict(float %a, float %b, <2 x float> %v) #0 {
 }
 
 define double @test_fmls_ddD_0_strict(double %a, double %b, <1 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ddD_0_strict
-  ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_ddD_0_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d2, d1, d0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg double %b
   %extract = extractelement <1 x double> %v, i64 0
@@ -419,8 +692,10 @@ entry:
 }
 
 define double @test_fmls_ddD_0_swap_strict(double %a, double %b, <1 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ddD_0_swap_strict
-  ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_ddD_0_swap_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d2, d1, d0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg double %b
   %extract = extractelement <1 x double> %v, i64 0
@@ -429,8 +704,10 @@ entry:
 }
 
 define double @test_fmls_dd2D_0_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmls_dd2D_0_strict
-  ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_dd2D_0_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d2, d1, d0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg double %b
   %extract = extractelement <2 x double> %v, i64 0
@@ -439,8 +716,10 @@ entry:
 }
 
 define double @test_fmls_dd2D_0_swap_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmls_dd2D_0_swap_strict
-  ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_dd2D_0_swap_strict:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub d0, d2, d1, d0
+; CHECK-NEXT:    ret
 entry:
   %fneg = fneg double %b
   %extract = extractelement <2 x double> %v, i64 0
@@ -449,8 +728,11 @@ entry:
 }
 
 define double @test_fmls_dd2D_1_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmls_dd2D_1_strict
-  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmls_dd2D_1_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov d1, v2.d[1]
+; CHECK-NEXT:    fmls d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = fneg double %tmp1
   %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp2, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -458,8 +740,11 @@ define double @test_fmls_dd2D_1_strict(double %a, double %b, <2 x double> %v) #0
 }
 
 define double @test_fmls_dd2D_1_swap_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmls_dd2D_1_swap_strict
-  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmls_dd2D_1_swap_strict:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov d1, v2.d[1]
+; CHECK-NEXT:    fmls d0, d1, v2.d[1]
+; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = fneg double %tmp1
   %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %tmp2, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")