[llvm] r313142 - [X86][FMA] Added *213 fma instructions to scheduling tests

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 13 04:12:57 PDT 2017


Author: rksimon
Date: Wed Sep 13 04:12:56 2017
New Revision: 313142

URL: http://llvm.org/viewvc/llvm-project?rev=313142&view=rev
Log:
[X86][FMA] Added *213 fma instructions to scheduling tests

Annoyingly, the 132/231 variants are pretty tricky to create when you need them, due to the weak FMA commutation patterns.
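
For reference, a minimal sketch (not part of this patch, just illustrative) of the kind of input that would want a non-213 form: with the load feeding a multiplicand instead of the addend, the memory operand can only be folded by the 132/231 encodings, so whether llc actually emits one comes down to the commutation patterns mentioned above. Running it through the same llc command lines as in the test below would show which form isel settles on.

define <2 x double> @fma_132_candidate(<2 x double> %x, <2 x double> %y, <2 x double> *%p) {
  ; the loaded value is a multiplicand, not the addend, so folding the load
  ; needs a *132/*231 memory form rather than the default *213 one
  %m = load <2 x double>, <2 x double> *%p
  %r = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %x, <2 x double> %m, <2 x double> %y)
  ret <2 x double> %r
}
declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)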

Added:
    llvm/trunk/test/CodeGen/X86/fma-schedule.ll

Added: llvm/trunk/test/CodeGen/X86/fma-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fma-schedule.ll?rev=313142&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fma-schedule.ll (added)
+++ llvm/trunk/test/CodeGen/X86/fma-schedule.ll Wed Sep 13 04:12:56 2017
@@ -0,0 +1,1269 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+;
+; VFMADD132 (TODO)
+;
+
+;
+; VFMADD213
+;
+
+define <2 x double> @test_vfmadd213pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vfmadd213pd:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmadd213pd:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmadd213pd:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmadd213pd:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmadd213pd:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = load <2 x double>, <2 x double> *%a3
+  %3 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <4 x double> @test_vfmadd213pd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
+; GENERIC-LABEL: test_vfmadd213pd_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmadd213pd_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmadd213pd_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmadd213pd_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmadd213pd_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = load <4 x double>, <4 x double> *%a3
+  %3 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
+  ret <4 x double> %3
+}
+
+define <4 x float> @test_vfmadd213ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vfmadd213ps:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmadd213ps:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmadd213ps:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmadd213ps:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmadd213ps:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = load <4 x float>, <4 x float> *%a3
+  %3 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+define <8 x float> @test_vfmadd213ps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
+; GENERIC-LABEL: test_vfmadd213ps_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmadd213ps_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmadd213ps_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmadd213ps_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmadd213ps_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  %2 = load <8 x float>, <8 x float> *%a3
+  %3 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
+  ret <8 x float> %3
+}
+
+define <2 x double> @test_vfmadd213sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vfmadd213sd:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmadd213sd:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmadd213sd:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmadd213sd:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmadd213sd:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = load <2 x double>, <2 x double> *%a3
+  %3 = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <4 x float> @test_vfmadd213ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vfmadd213ss:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmadd213ss:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmadd213ss:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmadd213ss:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmadd213ss:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = load <4 x float>, <4 x float> *%a3
+  %3 = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+;
+; VFMADD231 (TODO)
+;
+
+;
+; VFMADDSUB132 (TODO)
+;
+
+;
+; VFMADDSUB213
+;
+
+define <2 x double> @test_vfmaddsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vfmaddsubpd:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddsubpd:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddsubpd:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmaddsubpd:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddsubpd:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = load <2 x double>, <2 x double> *%a3
+  %3 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <4 x double> @test_vfmaddsubpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a4, <4 x double> *%a3) {
+; GENERIC-LABEL: test_vfmaddsubpd_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddsubpd_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddsubpd_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmaddsubpd_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddsubpd_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a4)
+  %2 = load <4 x double>, <4 x double> *%a3
+  %3 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
+  ret <4 x double> %3
+}
+
+define <4 x float> @test_vfmaddsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a4, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vfmaddsubps:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddsubps:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddsubps:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmaddsubps:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddsubps:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a4)
+  %2 = load <4 x float>, <4 x float> *%a3
+  %3 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+define <8 x float> @test_vfmaddsubps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a8, <8 x float> *%a3) {
+; GENERIC-LABEL: test_vfmaddsubps_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddsubps_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddsubps_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmaddsubps_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddsubps_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a8)
+  %2 = load <8 x float>, <8 x float> *%a3
+  %3 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
+  ret <8 x float> %3
+}
+
+;
+; VFMADDSUB231 (TODO)
+;
+
+;
+; VFMSUBADD132 (TODO)
+;
+
+;
+; VFMSUBADD213
+;
+
+define <2 x double> @test_vfmsubaddpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vfmsubaddpd:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubaddpd:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubaddpd:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmsubaddpd:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubaddpd:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = load <2 x double>, <2 x double> *%a3
+  %3 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <4 x double> @test_vfmsubaddpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a4, <4 x double> *%a3) {
+; GENERIC-LABEL: test_vfmsubaddpd_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubaddpd_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubaddpd_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmsubaddpd_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubaddpd_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a4)
+  %2 = load <4 x double>, <4 x double> *%a3
+  %3 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
+  ret <4 x double> %3
+}
+
+define <4 x float> @test_vfmsubaddps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a4, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vfmsubaddps:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubaddps:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubaddps:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmsubaddps:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubaddps:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a4)
+  %2 = load <4 x float>, <4 x float> *%a3
+  %3 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+define <8 x float> @test_vfmsubaddps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a8, <8 x float> *%a3) {
+; GENERIC-LABEL: test_vfmsubaddps_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubaddps_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubaddps_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmsubaddps_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubaddps_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a8)
+  %2 = load <8 x float>, <8 x float> *%a3
+  %3 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
+  ret <8 x float> %3
+}
+
+;
+; VFMSUBADD231 (TODO)
+;
+
+;
+; VFMSUB132 (TODO)
+;
+
+;
+; VFMSUB213
+;
+
+define <2 x double> @test_vfmsub213pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vfmsub213pd:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsub213pd:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsub213pd:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmsub213pd:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmsub213pd:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = load <2 x double>, <2 x double> *%a3
+  %3 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <4 x double> @test_vfmsub213pd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
+; GENERIC-LABEL: test_vfmsub213pd_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsub213pd_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsub213pd_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmsub213pd_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmsub213pd_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = load <4 x double>, <4 x double> *%a3
+  %3 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
+  ret <4 x double> %3
+}
+
+define <4 x float> @test_vfmsub213ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vfmsub213ps:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsub213ps:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsub213ps:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmsub213ps:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmsub213ps:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = load <4 x float>, <4 x float> *%a3
+  %3 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+define <8 x float> @test_vfmsub213ps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
+; GENERIC-LABEL: test_vfmsub213ps_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsub213ps_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsub213ps_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmsub213ps_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmsub213ps_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  %2 = load <8 x float>, <8 x float> *%a3
+  %3 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
+  ret <8 x float> %3
+}
+
+define <2 x double> @test_vfmsub213sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vfmsub213sd:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsub213sd:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsub213sd:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmsub213sd:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmsub213sd:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = load <2 x double>, <2 x double> *%a3
+  %3 = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <4 x float> @test_vfmsub213ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vfmsub213ss:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsub213ss:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsub213ss:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfmsub213ss:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfmsub213ss:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = load <4 x float>, <4 x float> *%a3
+  %3 = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+;
+; VFMSUB231 (TODO)
+;
+
+;
+; VFNMADD132 (TODO)
+;
+
+;
+; VFNMADD213
+;
+
+define <2 x double> @test_vfnmadd213pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vfnmadd213pd:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmadd213pd:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmadd213pd:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmadd213pd:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmadd213pd:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = load <2 x double>, <2 x double> *%a3
+  %3 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <4 x double> @test_vfnmadd213pd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
+; GENERIC-LABEL: test_vfnmadd213pd_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmadd213pd_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmadd213pd_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmadd213pd_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmadd213pd_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = load <4 x double>, <4 x double> *%a3
+  %3 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
+  ret <4 x double> %3
+}
+
+define <4 x float> @test_vfnmadd213ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vfnmadd213ps:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmadd213ps:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmadd213ps:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmadd213ps:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmadd213ps:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = load <4 x float>, <4 x float> *%a3
+  %3 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+define <8 x float> @test_vfnmadd213ps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
+; GENERIC-LABEL: test_vfnmadd213ps_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmadd213ps_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmadd213ps_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmadd213ps_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmadd213ps_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  %2 = load <8 x float>, <8 x float> *%a3
+  %3 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
+  ret <8 x float> %3
+}
+
+define <2 x double> @test_vfnmadd213sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vfnmadd213sd:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmadd213sd:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmadd213sd:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmadd213sd:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmadd213sd:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = load <2 x double>, <2 x double> *%a3
+  %3 = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <4 x float> @test_vfnmadd213ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vfnmadd213ss:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmadd213ss:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmadd213ss:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmadd213ss:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmadd213ss:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = load <4 x float>, <4 x float> *%a3
+  %3 = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+;
+; VFNMADD231 (TODO)
+;
+
+;
+; VFNMSUB132 (TODO)
+;
+
+;
+; VFNMSUB213
+;
+
+define <2 x double> @test_vfnmsub213pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vfnmsub213pd:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsub213pd:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsub213pd:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmsub213pd:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsub213pd:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = load <2 x double>, <2 x double> *%a3
+  %3 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <4 x double> @test_vfnmsub213pd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
+; GENERIC-LABEL: test_vfnmsub213pd_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsub213pd_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsub213pd_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmsub213pd_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsub213pd_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = load <4 x double>, <4 x double> *%a3
+  %3 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
+  ret <4 x double> %3
+}
+
+define <4 x float> @test_vfnmsub213ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vfnmsub213ps:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsub213ps:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsub213ps:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmsub213ps:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsub213ps:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = load <4 x float>, <4 x float> *%a3
+  %3 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+define <8 x float> @test_vfnmsub213ps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
+; GENERIC-LABEL: test_vfnmsub213ps_ymm:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
+; GENERIC-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsub213ps_ymm:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsub213ps_ymm:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmsub213ps_ymm:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsub213ps_ymm:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
+; ZNVER1-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  %2 = load <8 x float>, <8 x float> *%a3
+  %3 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
+  ret <8 x float> %3
+}
+
+define <2 x double> @test_vfnmsub213sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vfnmsub213sd:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsub213sd:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsub213sd:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmsub213sd:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsub213sd:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = load <2 x double>, <2 x double> *%a3
+  %3 = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <4 x float> @test_vfnmsub213ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vfnmsub213ss:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0
+; GENERIC-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsub213ss:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsub213ss:
+; SKYLAKE:       # BB#0:
+; SKYLAKE-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; SKYLAKE-NEXT:    retq # sched: [2:1.00]
+;
+; KNL-LABEL: test_vfnmsub213ss:
+; KNL:       # BB#0:
+; KNL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT:    retq # sched: [2:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsub213ss:
+; ZNVER1:       # BB#0:
+; ZNVER1-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0
+; ZNVER1-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0
+; ZNVER1-NEXT:    retq # sched: [1:0.50]
+  %1 = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = load <4 x float>, <4 x float> *%a3
+  %3 = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+;
+; VFNMSUB231 (TODO)
+;
+
+
+declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
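
The sched comments above read as [latency:reciprocal throughput] from each CPU's scheduling model. If a model changes, the CHECK lines can be regenerated with the script named in the test header; a typical invocation from the llvm source root, assuming the freshly built llc is first on PATH, would be:

  utils/update_llc_test_checks.py test/CodeGen/X86/fma-schedule.ll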
