[llvm] r320689 - [X86] Add FMA3 schedule tests

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 14 03:30:01 PST 2017


Author: rksimon
Date: Thu Dec 14 03:30:01 2017
New Revision: 320689

URL: http://llvm.org/viewvc/llvm-project?rev=320689&view=rev
Log:
[X86] Add FMA3 schedule tests

Rewrote to use inline asm for full coverage

Modified:
    llvm/trunk/test/CodeGen/X86/fma-schedule.ll

Modified: llvm/trunk/test/CodeGen/X86/fma-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fma-schedule.ll?rev=320689&r1=320688&r2=320689&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fma-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fma-schedule.ll Thu Dec 14 03:30:01 2017
@@ -8,1648 +8,2913 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
 ;
-; VFMADD132 (TODO)
+; VFMADD
 ;
 
-;
-; VFMADD213
-;
-
-define <2 x double> @test_vfmadd213pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
-; GENERIC-LABEL: test_vfmadd213pd:
+define void @test_vfmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddpd_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmadd213pd:
+; HASWELL-LABEL: test_vfmaddpd_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmadd213pd:
+; BROADWELL-LABEL: test_vfmaddpd_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmadd213pd:
+; SKYLAKE-LABEL: test_vfmaddpd_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmadd213pd:
+; KNL-LABEL: test_vfmaddpd_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmadd213pd:
+; SKX-LABEL: test_vfmaddpd_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmadd213pd:
+; ZNVER1-LABEL: test_vfmaddpd_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
-  %2 = load <2 x double>, <2 x double> *%a3
-  %3 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
-  ret <2 x double> %3
+  tail call void asm "vfmadd132pd $2, $1, $0 \0A\09 vfmadd213pd $2, $1, $0 \0A\09 vfmadd231pd $2, $1, $0 \0A\09 vfmadd132pd $3, $1, $0 \0A\09 vfmadd213pd $3, $1, $0 \0A\09 vfmadd231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x double> @test_vfmadd213pd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
-; GENERIC-LABEL: test_vfmadd213pd_ymm:
+define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddpd_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmadd213pd_ymm:
+; HASWELL-LABEL: test_vfmaddpd_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmadd213pd_ymm:
+; BROADWELL-LABEL: test_vfmaddpd_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmadd213pd_ymm:
+; SKYLAKE-LABEL: test_vfmaddpd_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmadd213pd_ymm:
+; KNL-LABEL: test_vfmaddpd_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmadd213pd_ymm:
+; SKX-LABEL: test_vfmaddpd_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmadd213pd_ymm:
+; ZNVER1-LABEL: test_vfmaddpd_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
-  %2 = load <4 x double>, <4 x double> *%a3
-  %3 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
-  ret <4 x double> %3
+  tail call void asm "vfmadd132pd $2, $1, $0 \0A\09 vfmadd213pd $2, $1, $0 \0A\09 vfmadd231pd $2, $1, $0 \0A\09 vfmadd132pd $3, $1, $0 \0A\09 vfmadd213pd $3, $1, $0 \0A\09 vfmadd231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x float> @test_vfmadd213ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
-; GENERIC-LABEL: test_vfmadd213ps:
+define void @test_vfmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddps_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmadd213ps:
+; HASWELL-LABEL: test_vfmaddps_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmadd213ps:
+; BROADWELL-LABEL: test_vfmaddps_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmadd213ps:
+; SKYLAKE-LABEL: test_vfmaddps_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmadd213ps:
+; KNL-LABEL: test_vfmaddps_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmadd213ps:
+; SKX-LABEL: test_vfmaddps_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmadd213ps:
+; ZNVER1-LABEL: test_vfmaddps_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
-  %2 = load <4 x float>, <4 x float> *%a3
-  %3 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
-  ret <4 x float> %3
+  tail call void asm "vfmadd132ps $2, $1, $0 \0A\09 vfmadd213ps $2, $1, $0 \0A\09 vfmadd231ps $2, $1, $0 \0A\09 vfmadd132ps $3, $1, $0 \0A\09 vfmadd213ps $3, $1, $0 \0A\09 vfmadd231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+  ret void
 }
 
-define <8 x float> @test_vfmadd213ps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
-; GENERIC-LABEL: test_vfmadd213ps_ymm:
+define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddps_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmadd213ps_ymm:
+; HASWELL-LABEL: test_vfmaddps_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmadd213ps_ymm:
+; BROADWELL-LABEL: test_vfmaddps_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmadd213ps_ymm:
+; SKYLAKE-LABEL: test_vfmaddps_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmadd213ps_ymm:
+; KNL-LABEL: test_vfmaddps_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmadd213ps_ymm:
+; SKX-LABEL: test_vfmaddps_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmadd213ps_ymm:
+; ZNVER1-LABEL: test_vfmaddps_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
-  %2 = load <8 x float>, <8 x float> *%a3
-  %3 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
-  ret <8 x float> %3
+  tail call void asm "vfmadd132ps $2, $1, $0 \0A\09 vfmadd213ps $2, $1, $0 \0A\09 vfmadd231ps $2, $1, $0 \0A\09 vfmadd132ps $3, $1, $0 \0A\09 vfmadd213ps $3, $1, $0 \0A\09 vfmadd231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+  ret void
 }
 
-define <2 x double> @test_vfmadd213sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
-; GENERIC-LABEL: test_vfmadd213sd:
+define void @test_vfmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsd_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmadd213sd:
+; HASWELL-LABEL: test_vfmaddsd_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; HASWELL-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmadd213sd:
+; BROADWELL-LABEL: test_vfmaddsd_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmadd213sd:
+; SKYLAKE-LABEL: test_vfmaddsd_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKYLAKE-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmadd213sd:
+; KNL-LABEL: test_vfmaddsd_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; KNL-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmadd213sd:
+; SKX-LABEL: test_vfmaddsd_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmadd213sd:
+; ZNVER1-LABEL: test_vfmaddsd_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
-  %2 = load <2 x double>, <2 x double> *%a3
-  %3 = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
-  ret <2 x double> %3
+  tail call void asm "vfmadd132sd $2, $1, $0 \0A\09 vfmadd213sd $2, $1, $0 \0A\09 vfmadd231sd $2, $1, $0 \0A\09 vfmadd132sd $3, $1, $0 \0A\09 vfmadd213sd $3, $1, $0 \0A\09 vfmadd231sd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x float> @test_vfmadd213ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
-; GENERIC-LABEL: test_vfmadd213ss:
+define void @test_vfmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddss_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmadd213ss:
+; HASWELL-LABEL: test_vfmaddss_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; HASWELL-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmadd213ss:
+; BROADWELL-LABEL: test_vfmaddss_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmadd213ss:
+; SKYLAKE-LABEL: test_vfmaddss_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKYLAKE-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmadd213ss:
+; KNL-LABEL: test_vfmaddss_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; KNL-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmadd213ss:
+; SKX-LABEL: test_vfmaddss_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmadd213ss:
+; ZNVER1-LABEL: test_vfmaddss_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
-  %2 = load <4 x float>, <4 x float> *%a3
-  %3 = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
-  ret <4 x float> %3
+  tail call void asm "vfmadd132ss $2, $1, $0 \0A\09 vfmadd213ss $2, $1, $0 \0A\09 vfmadd231ss $2, $1, $0 \0A\09 vfmadd132ss $3, $1, $0 \0A\09 vfmadd213ss $3, $1, $0 \0A\09 vfmadd231ss $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+  ret void
 }
 
 ;
-; VFMADD231 (TODO)
-;
-
-;
-; VFMADDSUB132 (TODO)
+; VFMADDSUB
 ;
 
-;
-; VFMADDSUB213
-;
-
-define <2 x double> @test_vfmaddsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
-; GENERIC-LABEL: test_vfmaddsubpd:
+define void @test_vfmaddsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubpd_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmaddsubpd:
+; HASWELL-LABEL: test_vfmaddsubpd_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmaddsubpd:
+; BROADWELL-LABEL: test_vfmaddsubpd_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmaddsubpd:
+; SKYLAKE-LABEL: test_vfmaddsubpd_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmaddsubpd:
+; KNL-LABEL: test_vfmaddsubpd_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmaddsubpd:
+; SKX-LABEL: test_vfmaddsubpd_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmaddsubpd:
+; ZNVER1-LABEL: test_vfmaddsubpd_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
-  %2 = load <2 x double>, <2 x double> *%a3
-  %3 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
-  ret <2 x double> %3
+  tail call void asm "vfmaddsub132pd $2, $1, $0 \0A\09 vfmaddsub213pd $2, $1, $0 \0A\09 vfmaddsub231pd $2, $1, $0 \0A\09 vfmaddsub132pd $3, $1, $0 \0A\09 vfmaddsub213pd $3, $1, $0 \0A\09 vfmaddsub231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x double> @test_vfmaddsubpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a4, <4 x double> *%a3) {
-; GENERIC-LABEL: test_vfmaddsubpd_ymm:
+define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubpd_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmaddsubpd_ymm:
+; HASWELL-LABEL: test_vfmaddsubpd_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmaddsubpd_ymm:
+; BROADWELL-LABEL: test_vfmaddsubpd_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmaddsubpd_ymm:
+; SKYLAKE-LABEL: test_vfmaddsubpd_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmaddsubpd_ymm:
+; KNL-LABEL: test_vfmaddsubpd_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmaddsubpd_ymm:
+; SKX-LABEL: test_vfmaddsubpd_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmaddsubpd_ymm:
+; ZNVER1-LABEL: test_vfmaddsubpd_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a4)
-  %2 = load <4 x double>, <4 x double> *%a3
-  %3 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
-  ret <4 x double> %3
+  tail call void asm "vfmaddsub132pd $2, $1, $0 \0A\09 vfmaddsub213pd $2, $1, $0 \0A\09 vfmaddsub231pd $2, $1, $0 \0A\09 vfmaddsub132pd $3, $1, $0 \0A\09 vfmaddsub213pd $3, $1, $0 \0A\09 vfmaddsub231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x float> @test_vfmaddsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a4, <4 x float> *%a3) {
-; GENERIC-LABEL: test_vfmaddsubps:
+define void @test_vfmaddsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubps_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmaddsubps:
+; HASWELL-LABEL: test_vfmaddsubps_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmaddsubps:
+; BROADWELL-LABEL: test_vfmaddsubps_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmaddsubps:
+; SKYLAKE-LABEL: test_vfmaddsubps_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmaddsubps:
+; KNL-LABEL: test_vfmaddsubps_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmaddsubps:
+; SKX-LABEL: test_vfmaddsubps_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmaddsubps:
+; ZNVER1-LABEL: test_vfmaddsubps_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a4)
-  %2 = load <4 x float>, <4 x float> *%a3
-  %3 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
-  ret <4 x float> %3
+  tail call void asm "vfmaddsub132ps $2, $1, $0 \0A\09 vfmaddsub213ps $2, $1, $0 \0A\09 vfmaddsub231ps $2, $1, $0 \0A\09 vfmaddsub132ps $3, $1, $0 \0A\09 vfmaddsub213ps $3, $1, $0 \0A\09 vfmaddsub231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+  ret void
 }
 
-define <8 x float> @test_vfmaddsubps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a8, <8 x float> *%a3) {
-; GENERIC-LABEL: test_vfmaddsubps_ymm:
+define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubps_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmaddsubps_ymm:
+; HASWELL-LABEL: test_vfmaddsubps_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmaddsubps_ymm:
+; BROADWELL-LABEL: test_vfmaddsubps_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmaddsubps_ymm:
+; SKYLAKE-LABEL: test_vfmaddsubps_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmaddsubps_ymm:
+; KNL-LABEL: test_vfmaddsubps_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmaddsubps_ymm:
+; SKX-LABEL: test_vfmaddsubps_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmaddsubps_ymm:
+; ZNVER1-LABEL: test_vfmaddsubps_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a8)
-  %2 = load <8 x float>, <8 x float> *%a3
-  %3 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
-  ret <8 x float> %3
+  tail call void asm "vfmaddsub132ps $2, $1, $0 \0A\09 vfmaddsub213ps $2, $1, $0 \0A\09 vfmaddsub231ps $2, $1, $0 \0A\09 vfmaddsub132ps $3, $1, $0 \0A\09 vfmaddsub213ps $3, $1, $0 \0A\09 vfmaddsub231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+  ret void
 }
 
 ;
-; VFMADDSUB231 (TODO)
+; VFMSUBADD
 ;
 
-;
-; VFMSUBADD132 (TODO)
-;
-
-;
-; VFMSUBADD213
-;
-
-define <2 x double> @test_vfmsubaddpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
-; GENERIC-LABEL: test_vfmsubaddpd:
+define void @test_vfmsubaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddpd_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmsubaddpd:
+; HASWELL-LABEL: test_vfmsubaddpd_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmsubaddpd:
+; BROADWELL-LABEL: test_vfmsubaddpd_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmsubaddpd:
+; SKYLAKE-LABEL: test_vfmsubaddpd_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmsubaddpd:
+; KNL-LABEL: test_vfmsubaddpd_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmsubaddpd:
+; SKX-LABEL: test_vfmsubaddpd_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmsubaddpd:
+; ZNVER1-LABEL: test_vfmsubaddpd_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
-  %2 = load <2 x double>, <2 x double> *%a3
-  %3 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
-  ret <2 x double> %3
+  tail call void asm "vfmsubadd132pd $2, $1, $0 \0A\09 vfmsubadd213pd $2, $1, $0 \0A\09 vfmsubadd231pd $2, $1, $0 \0A\09 vfmsubadd132pd $3, $1, $0 \0A\09 vfmsubadd213pd $3, $1, $0 \0A\09 vfmsubadd231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x double> @test_vfmsubaddpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a4, <4 x double> *%a3) {
-; GENERIC-LABEL: test_vfmsubaddpd_ymm:
+define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddpd_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmsubaddpd_ymm:
+; HASWELL-LABEL: test_vfmsubaddpd_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmsubaddpd_ymm:
+; BROADWELL-LABEL: test_vfmsubaddpd_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmsubaddpd_ymm:
+; SKYLAKE-LABEL: test_vfmsubaddpd_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmsubaddpd_ymm:
+; KNL-LABEL: test_vfmsubaddpd_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmsubaddpd_ymm:
+; SKX-LABEL: test_vfmsubaddpd_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmsubaddpd_ymm:
+; ZNVER1-LABEL: test_vfmsubaddpd_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a4)
-  %2 = load <4 x double>, <4 x double> *%a3
-  %3 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
-  ret <4 x double> %3
+  tail call void asm "vfmsubadd132pd $2, $1, $0 \0A\09 vfmsubadd213pd $2, $1, $0 \0A\09 vfmsubadd231pd $2, $1, $0 \0A\09 vfmsubadd132pd $3, $1, $0 \0A\09 vfmsubadd213pd $3, $1, $0 \0A\09 vfmsubadd231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x float> @test_vfmsubaddps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a4, <4 x float> *%a3) {
-; GENERIC-LABEL: test_vfmsubaddps:
+define void @test_vfmsubaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddps_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmsubaddps:
+; HASWELL-LABEL: test_vfmsubaddps_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmsubaddps:
+; BROADWELL-LABEL: test_vfmsubaddps_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmsubaddps:
+; SKYLAKE-LABEL: test_vfmsubaddps_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmsubaddps:
+; KNL-LABEL: test_vfmsubaddps_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmsubaddps:
+; SKX-LABEL: test_vfmsubaddps_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmsubaddps:
+; ZNVER1-LABEL: test_vfmsubaddps_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a4)
-  %2 = load <4 x float>, <4 x float> *%a3
-  %3 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
-  ret <4 x float> %3
+  tail call void asm "vfmsubadd132ps $2, $1, $0 \0A\09 vfmsubadd213ps $2, $1, $0 \0A\09 vfmsubadd231ps $2, $1, $0 \0A\09 vfmsubadd132ps $3, $1, $0 \0A\09 vfmsubadd213ps $3, $1, $0 \0A\09 vfmsubadd231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+  ret void
 }
 
-define <8 x float> @test_vfmsubaddps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a8, <8 x float> *%a3) {
-; GENERIC-LABEL: test_vfmsubaddps_ymm:
+define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddps_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmsubaddps_ymm:
+; HASWELL-LABEL: test_vfmsubaddps_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmsubaddps_ymm:
+; BROADWELL-LABEL: test_vfmsubaddps_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmsubaddps_ymm:
+; SKYLAKE-LABEL: test_vfmsubaddps_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmsubaddps_ymm:
+; KNL-LABEL: test_vfmsubaddps_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmsubaddps_ymm:
+; SKX-LABEL: test_vfmsubaddps_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmsubaddps_ymm:
+; ZNVER1-LABEL: test_vfmsubaddps_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a8)
-  %2 = load <8 x float>, <8 x float> *%a3
-  %3 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
-  ret <8 x float> %3
+  tail call void asm "vfmsubadd132ps $2, $1, $0 \0A\09 vfmsubadd213ps $2, $1, $0 \0A\09 vfmsubadd231ps $2, $1, $0 \0A\09 vfmsubadd132ps $3, $1, $0 \0A\09 vfmsubadd213ps $3, $1, $0 \0A\09 vfmsubadd231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+  ret void
 }
 
 ;
-; VFMSUBADD231 (TODO)
-;
-
-;
-; VFMSUB132 (TODO)
-;
-
-;
-; VFMSUB213
+; VFMSUB
 ;
 
-define <2 x double> @test_vfmsub213pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
-; GENERIC-LABEL: test_vfmsub213pd:
+define void @test_vfmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubpd_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmsub213pd:
+; HASWELL-LABEL: test_vfmsubpd_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmsub213pd:
+; BROADWELL-LABEL: test_vfmsubpd_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmsub213pd:
+; SKYLAKE-LABEL: test_vfmsubpd_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmsub213pd:
+; KNL-LABEL: test_vfmsubpd_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmsub213pd:
+; SKX-LABEL: test_vfmsubpd_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmsub213pd:
+; ZNVER1-LABEL: test_vfmsubpd_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
-  %2 = load <2 x double>, <2 x double> *%a3
-  %3 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
-  ret <2 x double> %3
+  tail call void asm "vfmsub132pd $2, $1, $0 \0A\09 vfmsub213pd $2, $1, $0 \0A\09 vfmsub231pd $2, $1, $0 \0A\09 vfmsub132pd $3, $1, $0 \0A\09 vfmsub213pd $3, $1, $0 \0A\09 vfmsub231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x double> @test_vfmsub213pd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
-; GENERIC-LABEL: test_vfmsub213pd_ymm:
+define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubpd_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmsub213pd_ymm:
+; HASWELL-LABEL: test_vfmsubpd_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmsub213pd_ymm:
+; BROADWELL-LABEL: test_vfmsubpd_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmsub213pd_ymm:
+; SKYLAKE-LABEL: test_vfmsubpd_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmsub213pd_ymm:
+; KNL-LABEL: test_vfmsubpd_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmsub213pd_ymm:
+; SKX-LABEL: test_vfmsubpd_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmsub213pd_ymm:
+; ZNVER1-LABEL: test_vfmsubpd_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
-  %2 = load <4 x double>, <4 x double> *%a3
-  %3 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
-  ret <4 x double> %3
+  tail call void asm "vfmsub132pd $2, $1, $0 \0A\09 vfmsub213pd $2, $1, $0 \0A\09 vfmsub231pd $2, $1, $0 \0A\09 vfmsub132pd $3, $1, $0 \0A\09 vfmsub213pd $3, $1, $0 \0A\09 vfmsub231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x float> @test_vfmsub213ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
-; GENERIC-LABEL: test_vfmsub213ps:
+define void @test_vfmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubps_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmsub213ps:
+; HASWELL-LABEL: test_vfmsubps_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmsub213ps:
+; BROADWELL-LABEL: test_vfmsubps_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmsub213ps:
+; SKYLAKE-LABEL: test_vfmsubps_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmsub213ps:
+; KNL-LABEL: test_vfmsubps_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmsub213ps:
+; SKX-LABEL: test_vfmsubps_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmsub213ps:
+; ZNVER1-LABEL: test_vfmsubps_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
-  %2 = load <4 x float>, <4 x float> *%a3
-  %3 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
-  ret <4 x float> %3
+  tail call void asm "vfmsub132ps $2, $1, $0 \0A\09 vfmsub213ps $2, $1, $0 \0A\09 vfmsub231ps $2, $1, $0 \0A\09 vfmsub132ps $3, $1, $0 \0A\09 vfmsub213ps $3, $1, $0 \0A\09 vfmsub231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+  ret void
 }
 
-define <8 x float> @test_vfmsub213ps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
-; GENERIC-LABEL: test_vfmsub213ps_ymm:
+define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubps_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmsub213ps_ymm:
+; HASWELL-LABEL: test_vfmsubps_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmsub213ps_ymm:
+; BROADWELL-LABEL: test_vfmsubps_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmsub213ps_ymm:
+; SKYLAKE-LABEL: test_vfmsubps_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmsub213ps_ymm:
+; KNL-LABEL: test_vfmsubps_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmsub213ps_ymm:
+; SKX-LABEL: test_vfmsubps_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmsub213ps_ymm:
+; ZNVER1-LABEL: test_vfmsubps_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
-  %2 = load <8 x float>, <8 x float> *%a3
-  %3 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
-  ret <8 x float> %3
+  tail call void asm "vfmsub132ps $2, $1, $0 \0A\09 vfmsub213ps $2, $1, $0 \0A\09 vfmsub231ps $2, $1, $0 \0A\09 vfmsub132ps $3, $1, $0 \0A\09 vfmsub213ps $3, $1, $0 \0A\09 vfmsub231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+  ret void
 }
 
-define <2 x double> @test_vfmsub213sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
-; GENERIC-LABEL: test_vfmsub213sd:
+define void @test_vfmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubsd_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmsub213sd:
+; HASWELL-LABEL: test_vfmsubsd_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; HASWELL-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmsub213sd:
+; BROADWELL-LABEL: test_vfmsubsd_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmsub213sd:
+; SKYLAKE-LABEL: test_vfmsubsd_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKYLAKE-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmsub213sd:
+; KNL-LABEL: test_vfmsubsd_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; KNL-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmsub213sd:
+; SKX-LABEL: test_vfmsubsd_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmsub213sd:
+; ZNVER1-LABEL: test_vfmsubsd_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
-  %2 = load <2 x double>, <2 x double> *%a3
-  %3 = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
-  ret <2 x double> %3
+  tail call void asm "vfmsub132sd $2, $1, $0 \0A\09 vfmsub213sd $2, $1, $0 \0A\09 vfmsub231sd $2, $1, $0 \0A\09 vfmsub132sd $3, $1, $0 \0A\09 vfmsub213sd $3, $1, $0 \0A\09 vfmsub231sd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x float> @test_vfmsub213ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
-; GENERIC-LABEL: test_vfmsub213ss:
+define void @test_vfmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubss_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfmsub213ss:
+; HASWELL-LABEL: test_vfmsubss_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; HASWELL-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfmsub213ss:
+; BROADWELL-LABEL: test_vfmsubss_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfmsub213ss:
+; SKYLAKE-LABEL: test_vfmsubss_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKYLAKE-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfmsub213ss:
+; KNL-LABEL: test_vfmsubss_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; KNL-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfmsub213ss:
+; SKX-LABEL: test_vfmsubss_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfmsub213ss:
+; ZNVER1-LABEL: test_vfmsubss_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
-  %2 = load <4 x float>, <4 x float> *%a3
-  %3 = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
-  ret <4 x float> %3
+  tail call void asm "vfmsub132ss $2, $1, $0 \0A\09 vfmsub213ss $2, $1, $0 \0A\09 vfmsub231ss $2, $1, $0 \0A\09 vfmsub132ss $3, $1, $0 \0A\09 vfmsub213ss $3, $1, $0 \0A\09 vfmsub231ss $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+  ret void
 }
 
 ;
-; VFMSUB231 (TODO)
-;
-
-;
-; VFNMADD132 (TODO)
+; VFNMADD
 ;
 
-;
-; VFNMADD213
-;
-
-define <2 x double> @test_vfnmadd213pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
-; GENERIC-LABEL: test_vfnmadd213pd:
+define void @test_vfnmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddpd_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmadd213pd:
+; HASWELL-LABEL: test_vfnmaddpd_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmadd213pd:
+; BROADWELL-LABEL: test_vfnmaddpd_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmadd213pd:
+; SKYLAKE-LABEL: test_vfnmaddpd_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmadd213pd:
+; KNL-LABEL: test_vfnmaddpd_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmadd213pd:
+; SKX-LABEL: test_vfnmaddpd_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmadd213pd:
+; ZNVER1-LABEL: test_vfnmaddpd_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
-  %2 = load <2 x double>, <2 x double> *%a3
-  %3 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
-  ret <2 x double> %3
+  tail call void asm "vfnmadd132pd $2, $1, $0 \0A\09 vfnmadd213pd $2, $1, $0 \0A\09 vfnmadd231pd $2, $1, $0 \0A\09 vfnmadd132pd $3, $1, $0 \0A\09 vfnmadd213pd $3, $1, $0 \0A\09 vfnmadd231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x double> @test_vfnmadd213pd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
-; GENERIC-LABEL: test_vfnmadd213pd_ymm:
+define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddpd_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmadd213pd_ymm:
+; HASWELL-LABEL: test_vfnmaddpd_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmadd213pd_ymm:
+; BROADWELL-LABEL: test_vfnmaddpd_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmadd213pd_ymm:
+; SKYLAKE-LABEL: test_vfnmaddpd_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmadd213pd_ymm:
+; KNL-LABEL: test_vfnmaddpd_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmadd213pd_ymm:
+; SKX-LABEL: test_vfnmaddpd_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmadd213pd_ymm:
+; ZNVER1-LABEL: test_vfnmaddpd_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
-  %2 = load <4 x double>, <4 x double> *%a3
-  %3 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
-  ret <4 x double> %3
+  tail call void asm "vfnmadd132pd $2, $1, $0 \0A\09 vfnmadd213pd $2, $1, $0 \0A\09 vfnmadd231pd $2, $1, $0 \0A\09 vfnmadd132pd $3, $1, $0 \0A\09 vfnmadd213pd $3, $1, $0 \0A\09 vfnmadd231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x float> @test_vfnmadd213ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
-; GENERIC-LABEL: test_vfnmadd213ps:
+define void @test_vfnmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddps_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmadd213ps:
+; HASWELL-LABEL: test_vfnmaddps_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmadd213ps:
+; BROADWELL-LABEL: test_vfnmaddps_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmadd213ps:
+; SKYLAKE-LABEL: test_vfnmaddps_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmadd213ps:
+; KNL-LABEL: test_vfnmaddps_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmadd213ps:
+; SKX-LABEL: test_vfnmaddps_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmadd213ps:
+; ZNVER1-LABEL: test_vfnmaddps_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
-  %2 = load <4 x float>, <4 x float> *%a3
-  %3 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
-  ret <4 x float> %3
+  tail call void asm "vfnmadd132ps $2, $1, $0 \0A\09 vfnmadd213ps $2, $1, $0 \0A\09 vfnmadd231ps $2, $1, $0 \0A\09 vfnmadd132ps $3, $1, $0 \0A\09 vfnmadd213ps $3, $1, $0 \0A\09 vfnmadd231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+  ret void
 }
 
-define <8 x float> @test_vfnmadd213ps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
-; GENERIC-LABEL: test_vfnmadd213ps_ymm:
+define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddps_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmadd213ps_ymm:
+; HASWELL-LABEL: test_vfnmaddps_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmadd213ps_ymm:
+; BROADWELL-LABEL: test_vfnmaddps_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmadd213ps_ymm:
+; SKYLAKE-LABEL: test_vfnmaddps_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmadd213ps_ymm:
+; KNL-LABEL: test_vfnmaddps_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmadd213ps_ymm:
+; SKX-LABEL: test_vfnmaddps_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmadd213ps_ymm:
+; ZNVER1-LABEL: test_vfnmaddps_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
-  %2 = load <8 x float>, <8 x float> *%a3
-  %3 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
-  ret <8 x float> %3
+  tail call void asm "vfnmadd132ps $2, $1, $0 \0A\09 vfnmadd213ps $2, $1, $0 \0A\09 vfnmadd231ps $2, $1, $0 \0A\09 vfnmadd132ps $3, $1, $0 \0A\09 vfnmadd213ps $3, $1, $0 \0A\09 vfnmadd231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+  ret void
 }
 
-define <2 x double> @test_vfnmadd213sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
-; GENERIC-LABEL: test_vfnmadd213sd:
+define void @test_vfnmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddsd_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmadd213sd:
+; HASWELL-LABEL: test_vfnmaddsd_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; HASWELL-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmadd213sd:
+; BROADWELL-LABEL: test_vfnmaddsd_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmadd213sd:
+; SKYLAKE-LABEL: test_vfnmaddsd_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmadd213sd:
+; KNL-LABEL: test_vfnmaddsd_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; KNL-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmadd213sd:
+; SKX-LABEL: test_vfnmaddsd_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmadd213sd:
+; ZNVER1-LABEL: test_vfnmaddsd_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
-  %2 = load <2 x double>, <2 x double> *%a3
-  %3 = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
-  ret <2 x double> %3
+  tail call void asm "vfnmadd132sd $2, $1, $0 \0A\09 vfnmadd213sd $2, $1, $0 \0A\09 vfnmadd231sd $2, $1, $0 \0A\09 vfnmadd132sd $3, $1, $0 \0A\09 vfnmadd213sd $3, $1, $0 \0A\09 vfnmadd231sd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x float> @test_vfnmadd213ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
-; GENERIC-LABEL: test_vfnmadd213ss:
+define void @test_vfnmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddss_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmadd213ss:
+; HASWELL-LABEL: test_vfnmaddss_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; HASWELL-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmadd213ss:
+; BROADWELL-LABEL: test_vfnmaddss_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmadd213ss:
+; SKYLAKE-LABEL: test_vfnmaddss_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKYLAKE-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmadd213ss:
+; KNL-LABEL: test_vfnmaddss_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; KNL-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmadd213ss:
+; SKX-LABEL: test_vfnmaddss_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmadd213ss:
+; ZNVER1-LABEL: test_vfnmaddss_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
-  %2 = load <4 x float>, <4 x float> *%a3
-  %3 = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
-  ret <4 x float> %3
+  tail call void asm "vfnmadd132ss $2, $1, $0 \0A\09 vfnmadd213ss $2, $1, $0 \0A\09 vfnmadd231ss $2, $1, $0 \0A\09 vfnmadd132ss $3, $1, $0 \0A\09 vfnmadd213ss $3, $1, $0 \0A\09 vfnmadd231ss $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+  ret void
 }
 
 ;
-; VFNMADD231 (TODO)
+; VFNMSUB
 ;
 
-;
-; VFNMSUB132 (TODO)
-;
-
-;
-; VFNMSUB213
-;
-
-define <2 x double> @test_vfnmsub213pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
-; GENERIC-LABEL: test_vfnmsub213pd:
+define void @test_vfnmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubpd_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmsub213pd:
+; HASWELL-LABEL: test_vfnmsubpd_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmsub213pd:
+; BROADWELL-LABEL: test_vfnmsubpd_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmsub213pd:
+; SKYLAKE-LABEL: test_vfnmsubpd_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmsub213pd:
+; KNL-LABEL: test_vfnmsubpd_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmsub213pd:
+; SKX-LABEL: test_vfnmsubpd_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmsub213pd:
+; ZNVER1-LABEL: test_vfnmsubpd_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
-  %2 = load <2 x double>, <2 x double> *%a3
-  %3 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
-  ret <2 x double> %3
+  tail call void asm "vfnmsub132pd $2, $1, $0 \0A\09 vfnmsub213pd $2, $1, $0 \0A\09 vfnmsub231pd $2, $1, $0 \0A\09 vfnmsub132pd $3, $1, $0 \0A\09 vfnmsub213pd $3, $1, $0 \0A\09 vfnmsub231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x double> @test_vfnmsub213pd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
-; GENERIC-LABEL: test_vfnmsub213pd_ymm:
+define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubpd_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmsub213pd_ymm:
+; HASWELL-LABEL: test_vfnmsubpd_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmsub213pd_ymm:
+; BROADWELL-LABEL: test_vfnmsubpd_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmsub213pd_ymm:
+; SKYLAKE-LABEL: test_vfnmsubpd_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmsub213pd_ymm:
+; KNL-LABEL: test_vfnmsubpd_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmsub213pd_ymm:
+; SKX-LABEL: test_vfnmsubpd_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmsub213pd_ymm:
+; ZNVER1-LABEL: test_vfnmsubpd_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
-  %2 = load <4 x double>, <4 x double> *%a3
-  %3 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
-  ret <4 x double> %3
+  tail call void asm "vfnmsub132pd $2, $1, $0 \0A\09 vfnmsub213pd $2, $1, $0 \0A\09 vfnmsub231pd $2, $1, $0 \0A\09 vfnmsub132pd $3, $1, $0 \0A\09 vfnmsub213pd $3, $1, $0 \0A\09 vfnmsub231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x float> @test_vfnmsub213ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
-; GENERIC-LABEL: test_vfnmsub213ps:
+define void @test_vfnmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubps_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmsub213ps:
+; HASWELL-LABEL: test_vfnmsubps_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; HASWELL-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmsub213ps:
+; BROADWELL-LABEL: test_vfnmsubps_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmsub213ps:
+; SKYLAKE-LABEL: test_vfnmsubps_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmsub213ps:
+; KNL-LABEL: test_vfnmsubps_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
 ; KNL-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmsub213ps:
+; SKX-LABEL: test_vfnmsubps_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmsub213ps:
+; ZNVER1-LABEL: test_vfnmsubps_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
-  %2 = load <4 x float>, <4 x float> *%a3
-  %3 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
-  ret <4 x float> %3
+  tail call void asm "vfnmsub132ps $2, $1, $0 \0A\09 vfnmsub213ps $2, $1, $0 \0A\09 vfnmsub231ps $2, $1, $0 \0A\09 vfnmsub132ps $3, $1, $0 \0A\09 vfnmsub213ps $3, $1, $0 \0A\09 vfnmsub231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+  ret void
 }
 
-define <8 x float> @test_vfnmsub213ps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
-; GENERIC-LABEL: test_vfnmsub213ps_ymm:
+define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubps_256:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
+; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmsub213ps_ymm:
+; HASWELL-LABEL: test_vfnmsubps_256:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; HASWELL-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT:    #NO_APP
+; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmsub213ps_ymm:
+; BROADWELL-LABEL: test_vfnmsubps_256:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; BROADWELL-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT:    #NO_APP
+; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmsub213ps_ymm:
+; SKYLAKE-LABEL: test_vfnmsubps_256:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    #NO_APP
+; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmsub213ps_ymm:
+; KNL-LABEL: test_vfnmsubps_256:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; KNL-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmsub213ps_ymm:
+; SKX-LABEL: test_vfnmsubps_256:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    #NO_APP
+; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmsub213ps_ymm:
+; ZNVER1-LABEL: test_vfnmsubps_256:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
+; ZNVER1-NEXT:    vzeroupper # sched: [100:?]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
-  %2 = load <8 x float>, <8 x float> *%a3
-  %3 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
-  ret <8 x float> %3
+  tail call void asm "vfnmsub132ps $2, $1, $0 \0A\09 vfnmsub213ps $2, $1, $0 \0A\09 vfnmsub231ps $2, $1, $0 \0A\09 vfnmsub132ps $3, $1, $0 \0A\09 vfnmsub213ps $3, $1, $0 \0A\09 vfnmsub231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+  ret void
 }
 
-define <2 x double> @test_vfnmsub213sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
-; GENERIC-LABEL: test_vfnmsub213sd:
+define void @test_vfnmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubsd_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmsub213sd:
+; HASWELL-LABEL: test_vfnmsubsd_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; HASWELL-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmsub213sd:
+; BROADWELL-LABEL: test_vfnmsubsd_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmsub213sd:
+; SKYLAKE-LABEL: test_vfnmsubsd_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmsub213sd:
+; KNL-LABEL: test_vfnmsubsd_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; KNL-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmsub213sd:
+; SKX-LABEL: test_vfnmsubsd_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmsub213sd:
+; ZNVER1-LABEL: test_vfnmsubsd_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
-  %2 = load <2 x double>, <2 x double> *%a3
-  %3 = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
-  ret <2 x double> %3
+  tail call void asm "vfnmsub132sd $2, $1, $0 \0A\09 vfnmsub213sd $2, $1, $0 \0A\09 vfnmsub231sd $2, $1, $0 \0A\09 vfnmsub132sd $3, $1, $0 \0A\09 vfnmsub213sd $3, $1, $0 \0A\09 vfnmsub231sd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+  ret void
 }
 
-define <4 x float> @test_vfnmsub213ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
-; GENERIC-LABEL: test_vfnmsub213ss:
+define void @test_vfnmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubss_128:
 ; GENERIC:       # %bb.0:
+; GENERIC-NEXT:    #APP
+; GENERIC-NEXT:    vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; GENERIC-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT:    vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; GENERIC-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; HASWELL-LABEL: test_vfnmsub213ss:
+; HASWELL-LABEL: test_vfnmsubss_128:
 ; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    #APP
+; HASWELL-NEXT:    vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; HASWELL-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT:    #NO_APP
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; BROADWELL-LABEL: test_vfnmsub213ss:
+; BROADWELL-LABEL: test_vfnmsubss_128:
 ; BROADWELL:       # %bb.0:
+; BROADWELL-NEXT:    #APP
+; BROADWELL-NEXT:    vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; BROADWELL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT:    vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; BROADWELL-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT:    #NO_APP
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKYLAKE-LABEL: test_vfnmsub213ss:
+; SKYLAKE-LABEL: test_vfnmsubss_128:
 ; SKYLAKE:       # %bb.0:
+; SKYLAKE-NEXT:    #APP
+; SKYLAKE-NEXT:    vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKYLAKE-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT:    #NO_APP
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
-; KNL-LABEL: test_vfnmsub213ss:
+; KNL-LABEL: test_vfnmsubss_128:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    #APP
+; KNL-NEXT:    vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; KNL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
 ; KNL-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT:    #NO_APP
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
-; SKX-LABEL: test_vfnmsub213ss:
+; SKX-LABEL: test_vfnmsubss_128:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    #APP
+; SKX-NEXT:    vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
-; ZNVER1-LABEL: test_vfnmsub213ss:
+; ZNVER1-LABEL: test_vfnmsubss_128:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    #APP
+; ZNVER1-NEXT:    vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
 ; ZNVER1-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT:    vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
 ; ZNVER1-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT:    #NO_APP
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
-  %2 = load <4 x float>, <4 x float> *%a3
-  %3 = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
-  ret <4 x float> %3
+  tail call void asm "vfnmsub132ss $2, $1, $0 \0A\09 vfnmsub213ss $2, $1, $0 \0A\09 vfnmsub231ss $2, $1, $0 \0A\09 vfnmsub132ss $3, $1, $0 \0A\09 vfnmsub213ss $3, $1, $0 \0A\09 vfnmsub231ss $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+  ret void
 }
-
-;
-; VFNMSUB231 (TODO)
-;
-
-
-declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
-declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
-declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
-declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
-declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
-
-declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
-declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
-declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
-
-declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
-declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
-declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
-
-declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
-declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
-declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
-declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
-declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
-
-declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
-declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
-declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
-declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
-declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
-
-declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
-declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
-declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
-declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
-declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)




More information about the llvm-commits mailing list