[llvm] r252060 - Created new X86 FMA3 opcodes (FMA*_Int) that are used now for lowering of scalar FMA intrinsics.

Quentin Colombet via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 4 10:31:06 PST 2015


Hi,

> On Nov 4, 2015, at 10:10 AM, Andrew Kaylor via llvm-commits <llvm-commits at lists.llvm.org> wrote:
> 
> Author: akaylor
> Date: Wed Nov  4 12:10:41 2015
> New Revision: 252060
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=252060&view=rev
> Log:
> Created new X86 FMA3 opcodes (FMA*_Int) that are used now for lowering of scalar FMA intrinsics.
> 
> Patch by Slava Klochkov 
> 
> The key difference between FMA* and FMA*_Int opcodes is that FMA*_Int opcodes are handled more conservatively. It is illegal to commute the 1st operand of FMA*_Int instructions as the upper bits of scalar FMA intrinsic result must be taken from the 1st operand, but such commute transformation would change those upper bits and invalidate the intrinsic's result.
> 
> Reviewers: Quentin Colombet, Elena Demikhovsky
> 
> Differential Revision: http://reviews.llvm.org/D13710
> 
> 
> Modified:
>    llvm/trunk/lib/Target/X86/X86InstrFMA.td
>    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
>    llvm/trunk/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
>    llvm/trunk/test/CodeGen/X86/fma-intrinsics-x86.ll
> 
> Modified: llvm/trunk/lib/Target/X86/X86InstrFMA.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFMA.td?rev=252060&r1=252059&r2=252060&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrFMA.td (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrFMA.td Wed Nov  4 12:10:41 2015
> @@ -126,9 +126,22 @@ let ExeDomain = SSEPackedDouble in {
>                                v4f64>, VEX_W;
> }
> 
> -let Constraints = "$src1 = $dst" in {
> -multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
> -                    RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
> +// All source register operands of FMA instructions can be commuted.
> +// In many cases such commute transformation requres an opcode adjustment,
> +// for example, commuting the operands 1 and 2 in FMA*132 form would require
> +// an opcode change to FMA*231:
> +//     FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
> +//     -->
> +//     FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
> +// Currently, the commute transformation is supported for only few FMA forms.
> +// That is the reason why \p IsRVariantCommutable and \p IsMVariantCommutable
> +// parameters are used here.
> +// The general commute operands optimization working for all forms is going
> +// to be implemented soon. (Please, see http://reviews.llvm.org/D13269
> +// for details).
> +let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
> +multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
> +                    X86MemOperand x86memop, RegisterClass RC,
>                     bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
>                     SDPatternOperator OpNode = null_frag> {
>   let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
> @@ -136,8 +149,7 @@ multiclass fma3s_rm<bits<8> opc, string
>                    (ins RC:$src1, RC:$src2, RC:$src3),
>                    !strconcat(OpcodeStr,
>                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
> -                   [(set RC:$dst,
> -                     (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
> +                   [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>;
> 
>   let mayLoad = 1, isCommutable = IsMVariantCommutable in
>   def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
> @@ -145,52 +157,96 @@ multiclass fma3s_rm<bits<8> opc, string
>                    !strconcat(OpcodeStr,
>                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
>                    [(set RC:$dst,
> -                     (OpVT (OpNode RC:$src2, RC:$src1,
> -                            (mem_frag addr:$src3))))]>;
> +                     (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>;
> +}
> +} // Constraints = "$src1 = $dst", hasSideEffects = 0
> +
> +// These FMA*_Int instructions are defined specially for being used when
> +// the scalar FMA intrinsics are lowered to machine instructions, and in that
> +// sence they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc.
> +// instructions.
> +//
> +// The FMA*_Int instructions are _TEMPORARILY_ defined as NOT commutable.
> +// The upper bits of the result of scalar FMA intrinsics must be copied from
> +// the upper bits of the 1st operand. So, commuting the 1st operand would
> +// invalidate the upper bits of the intrinsic result.
> +// The corresponding optimization which allows commuting 2nd and 3rd operands
> +// of FMA*_Int instructions has been developed and is waiting for
> +// code-review approval and checkin (Please see http://reviews.llvm.org/D13269).
> +let Constraints = "$src1 = $dst", isCommutable = 0, isCodeGenOnly =1,
> +    hasSideEffects = 0 in {
> +multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
> +                        Operand memopr, RegisterClass RC> {
> +  def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
> +                   (ins RC:$src1, RC:$src2, RC:$src3),
> +                   !strconcat(OpcodeStr,
> +                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
> +                   []>;
> +
> +  let mayLoad = 1 in
> +  def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
> +                   (ins RC:$src1, RC:$src2, memopr:$src3),
> +                   !strconcat(OpcodeStr,
> +                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
> +                   []>;
> }
> -} // Constraints = "$src1 = $dst"
> +} // Constraints = "$src1 = $dst", isCommutable = 0, isCodeGenOnly =1,
> +  // hasSideEffects = 0
> 
> multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
> -                       string OpStr, string PackTy, string PT2, Intrinsic Int,
> -                       SDNode OpNode, RegisterClass RC, ValueType OpVT,
> -                       X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
> -                       ComplexPattern mem_cpat> {
> -let hasSideEffects = 0 in {
> -  defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
> -                       x86memop, RC, OpVT, mem_frag>;
> -  // See the other defm of r231 for the explanation regarding the
> -  // commutable flags.
> -  defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
> -                       x86memop, RC, OpVT, mem_frag,
> +                       string OpStr, string PackTy,
> +                       SDNode OpNode, RegisterClass RC,
> +                       X86MemOperand x86memop> {
> +  defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>;
> +  defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC,
> +                       /* IsRVariantCommutable */ 1,
> +                       /* IsMVariantCommutable */ 1,
> +                       OpNode>;
> +  defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC,
>                        /* IsRVariantCommutable */ 1,
> -                       /* IsMVariantCommutable */ 0>;
> +                       /* IsMVariantCommutable */ 0,
> +                       null_frag>;
> }
> 
> -// See the other defm of r213 for the explanation regarding the
> -// commutable flags.
> -defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
> -                     x86memop, RC, OpVT, mem_frag,
> -                     /* IsRVariantCommutable */ 1,
> -                     /* IsMVariantCommutable */ 1,
> -                     OpNode>;
> +// The FMA 213 form is created for lowering of scalar FMA intrinscis
> +// to machine instructions.
> +// The FMA 132 form can trivially be get by commuting the 2nd and 3rd operands
> +// of FMA 213 form.
> +// The FMA 231 form can be get only by commuting the 1st operand of 213 or 132
> +// forms and is possible only after special analysis of all uses of the initial
> +// instruction. Such analysis do not exist yet and thus introducing the 231
> +// form of FMA*_Int instructions is done using an optimistic assumption that
> +// such analysis will be implemented eventually.
> +multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
> +                           string OpStr, string PackTy,
> +                           RegisterClass RC, Operand memop> {
> +  defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
> +                           memop, RC>;
> +  defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
> +                           memop, RC>;
> +  defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
> +                           memop, RC>;
> }
> 
> multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
>                  string OpStr, Intrinsic IntF32, Intrinsic IntF64,
>                  SDNode OpNode> {
> -  defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,
> -                        FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
> -  defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
> -                        FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
> -
> -// These patterns use the 123 ordering, instead of 213, even though
> -// they match the intrinsic to the 213 version of the instruction.
> -// This is because src1 is tied to dest, and the scalar intrinsics
> -// require the pass-through values to come from the first source
> -// operand, not the second.
> +  defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode,
> +                        FR32, f32mem>,
> +            fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>;
> +  defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode,
> +                        FR64, f64mem>,
> +            fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>,
> +            VEX_W;
> +
> +  // These patterns use the 123 ordering, instead of 213, even though
> +  // they match the intrinsic to the 213 version of the instruction.
> +  // This is because src1 is tied to dest, and the scalar intrinsics
> +  // require the pass-through values to come from the first source
> +  // operand, not the second.
>   def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
>             (COPY_TO_REGCLASS
> -              (!cast<Instruction>(NAME#"SSr213r")
> +              (!cast<Instruction>(NAME#"SSr213r_Int")
>                 (COPY_TO_REGCLASS $src1, FR32),
>                 (COPY_TO_REGCLASS $src2, FR32),
>                 (COPY_TO_REGCLASS $src3, FR32)),
> @@ -198,7 +254,7 @@ multiclass fma3s<bits<8> opc132, bits<8>
> 
>   def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
>             (COPY_TO_REGCLASS
> -              (!cast<Instruction>(NAME#"SDr213r")
> +              (!cast<Instruction>(NAME#"SDr213r_Int")
>                 (COPY_TO_REGCLASS $src1, FR64),
>                 (COPY_TO_REGCLASS $src2, FR64),
>                 (COPY_TO_REGCLASS $src3, FR64)),
> 
> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=252060&r1=252059&r2=252060&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Wed Nov  4 12:10:41 2015
> @@ -1734,11 +1734,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
>   static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
>     // FMA foldable instructions
>     { X86::VFMADDSSr231r,         X86::VFMADDSSr231m,         TB_ALIGN_NONE },
> +    { X86::VFMADDSSr231r_Int,     X86::VFMADDSSr231m_Int,     TB_ALIGN_NONE },
>     { X86::VFMADDSDr231r,         X86::VFMADDSDr231m,         TB_ALIGN_NONE },
> +    { X86::VFMADDSDr231r_Int,     X86::VFMADDSDr231m_Int,     TB_ALIGN_NONE },
>     { X86::VFMADDSSr132r,         X86::VFMADDSSr132m,         TB_ALIGN_NONE },
> +    { X86::VFMADDSSr132r_Int,     X86::VFMADDSSr132m_Int,     TB_ALIGN_NONE },
>     { X86::VFMADDSDr132r,         X86::VFMADDSDr132m,         TB_ALIGN_NONE },
> +    { X86::VFMADDSDr132r_Int,     X86::VFMADDSDr132m_Int,     TB_ALIGN_NONE },
>     { X86::VFMADDSSr213r,         X86::VFMADDSSr213m,         TB_ALIGN_NONE },
> +    { X86::VFMADDSSr213r_Int,     X86::VFMADDSSr213m_Int,     TB_ALIGN_NONE },
>     { X86::VFMADDSDr213r,         X86::VFMADDSDr213m,         TB_ALIGN_NONE },
> +    { X86::VFMADDSDr213r_Int,     X86::VFMADDSDr213m_Int,     TB_ALIGN_NONE },
> 
>     { X86::VFMADDPSr231r,         X86::VFMADDPSr231m,         TB_ALIGN_NONE },
>     { X86::VFMADDPDr231r,         X86::VFMADDPDr231m,         TB_ALIGN_NONE },
> @@ -1754,11 +1760,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
>     { X86::VFMADDPDr213rY,        X86::VFMADDPDr213mY,        TB_ALIGN_NONE },
> 
>     { X86::VFNMADDSSr231r,        X86::VFNMADDSSr231m,        TB_ALIGN_NONE },
> +    { X86::VFNMADDSSr231r_Int,    X86::VFNMADDSSr231m_Int,    TB_ALIGN_NONE },
>     { X86::VFNMADDSDr231r,        X86::VFNMADDSDr231m,        TB_ALIGN_NONE },
> +    { X86::VFNMADDSDr231r_Int,    X86::VFNMADDSDr231m_Int,    TB_ALIGN_NONE },
>     { X86::VFNMADDSSr132r,        X86::VFNMADDSSr132m,        TB_ALIGN_NONE },
> +    { X86::VFNMADDSSr132r_Int,    X86::VFNMADDSSr132m_Int,    TB_ALIGN_NONE },
>     { X86::VFNMADDSDr132r,        X86::VFNMADDSDr132m,        TB_ALIGN_NONE },
> +    { X86::VFNMADDSDr132r_Int,    X86::VFNMADDSDr132m_Int,    TB_ALIGN_NONE },
>     { X86::VFNMADDSSr213r,        X86::VFNMADDSSr213m,        TB_ALIGN_NONE },
> +    { X86::VFNMADDSSr213r_Int,    X86::VFNMADDSSr213m_Int,    TB_ALIGN_NONE },
>     { X86::VFNMADDSDr213r,        X86::VFNMADDSDr213m,        TB_ALIGN_NONE },
> +    { X86::VFNMADDSDr213r_Int,    X86::VFNMADDSDr213m_Int,    TB_ALIGN_NONE },
> 
>     { X86::VFNMADDPSr231r,        X86::VFNMADDPSr231m,        TB_ALIGN_NONE },
>     { X86::VFNMADDPDr231r,        X86::VFNMADDPDr231m,        TB_ALIGN_NONE },
> @@ -1774,11 +1786,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
>     { X86::VFNMADDPDr213rY,       X86::VFNMADDPDr213mY,       TB_ALIGN_NONE },
> 
>     { X86::VFMSUBSSr231r,         X86::VFMSUBSSr231m,         TB_ALIGN_NONE },
> +    { X86::VFMSUBSSr231r_Int,     X86::VFMSUBSSr231m_Int,     TB_ALIGN_NONE },
>     { X86::VFMSUBSDr231r,         X86::VFMSUBSDr231m,         TB_ALIGN_NONE },
> +    { X86::VFMSUBSDr231r_Int,     X86::VFMSUBSDr231m_Int,     TB_ALIGN_NONE },
>     { X86::VFMSUBSSr132r,         X86::VFMSUBSSr132m,         TB_ALIGN_NONE },
> +    { X86::VFMSUBSSr132r_Int,     X86::VFMSUBSSr132m_Int,     TB_ALIGN_NONE },
>     { X86::VFMSUBSDr132r,         X86::VFMSUBSDr132m,         TB_ALIGN_NONE },
> +    { X86::VFMSUBSDr132r_Int,     X86::VFMSUBSDr132m_Int,     TB_ALIGN_NONE },
>     { X86::VFMSUBSSr213r,         X86::VFMSUBSSr213m,         TB_ALIGN_NONE },
> +    { X86::VFMSUBSSr213r_Int,     X86::VFMSUBSSr213m_Int,     TB_ALIGN_NONE },
>     { X86::VFMSUBSDr213r,         X86::VFMSUBSDr213m,         TB_ALIGN_NONE },
> +    { X86::VFMSUBSDr213r_Int,     X86::VFMSUBSDr213m_Int,     TB_ALIGN_NONE },
> 
>     { X86::VFMSUBPSr231r,         X86::VFMSUBPSr231m,         TB_ALIGN_NONE },
>     { X86::VFMSUBPDr231r,         X86::VFMSUBPDr231m,         TB_ALIGN_NONE },
> @@ -1794,11 +1812,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
>     { X86::VFMSUBPDr213rY,        X86::VFMSUBPDr213mY,        TB_ALIGN_NONE },
> 
>     { X86::VFNMSUBSSr231r,        X86::VFNMSUBSSr231m,        TB_ALIGN_NONE },
> +    { X86::VFNMSUBSSr231r_Int,    X86::VFNMSUBSSr231m_Int,    TB_ALIGN_NONE },
>     { X86::VFNMSUBSDr231r,        X86::VFNMSUBSDr231m,        TB_ALIGN_NONE },
> +    { X86::VFNMSUBSDr231r_Int,    X86::VFNMSUBSDr231m_Int,    TB_ALIGN_NONE },
>     { X86::VFNMSUBSSr132r,        X86::VFNMSUBSSr132m,        TB_ALIGN_NONE },
> +    { X86::VFNMSUBSSr132r_Int,    X86::VFNMSUBSSr132m_Int,    TB_ALIGN_NONE },
>     { X86::VFNMSUBSDr132r,        X86::VFNMSUBSDr132m,        TB_ALIGN_NONE },
> +    { X86::VFNMSUBSDr132r_Int,    X86::VFNMSUBSDr132m_Int,    TB_ALIGN_NONE },
>     { X86::VFNMSUBSSr213r,        X86::VFNMSUBSSr213m,        TB_ALIGN_NONE },
> +    { X86::VFNMSUBSSr213r_Int,    X86::VFNMSUBSSr213m_Int,    TB_ALIGN_NONE },
>     { X86::VFNMSUBSDr213r,        X86::VFNMSUBSDr213m,        TB_ALIGN_NONE },
> +    { X86::VFNMSUBSDr213r_Int,    X86::VFNMSUBSDr213m_Int,    TB_ALIGN_NONE },
> 
>     { X86::VFNMSUBPSr231r,        X86::VFNMSUBPSr231m,        TB_ALIGN_NONE },
>     { X86::VFNMSUBPDr231r,        X86::VFNMSUBPDr231m,        TB_ALIGN_NONE },
> 
> Modified: llvm/trunk/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll?rev=252060&r1=252059&r2=252060&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll Wed Nov  4 12:10:41 2015
> @@ -1,8 +1,337 @@
> ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s
> 
> -; CHECK-LABEL: fmaddsubpd_loop:
> -; CHECK:   vfmaddsub231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
> -define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
> +; CHECK-LABEL: fmaddsubpd_loop_128:
> +; CHECK:   vfmaddsub231pd %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq

Those check lines seem fragile to me. Although we read %a, %b, and %c, which come from the ABI, we do not have any guarantee that they have not been redefined.
Well, to be fair, based on the IR, that shouldn’t happen, but the function seems simple enough that we could add a few more check lines to verify that this indeed does not happen.

> +define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <2 x double> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fmsubaddpd_loop_128:
> +; CHECK:   vfmsubadd231pd %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq

Ditto.

> +define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <2 x double> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fmaddpd_loop_128:
> +; CHECK:   vfmadd231pd %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq

Ditto, etc.

> +define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <2 x double> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fmsubpd_loop_128:
> +; CHECK:   vfmsub231pd %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq
> +define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <2 x double> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fnmaddpd_loop_128:
> +; CHECK:   vfnmadd231pd %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq
> +define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <2 x double> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fnmsubpd_loop_128:
> +; CHECK:   vfnmsub231pd %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq
> +define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <2 x double> %c.addr.0
> +}
> +
> +declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
> +declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
> +declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
> +declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
> +declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
> +declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
> +
> +
> +; CHECK-LABEL: fmaddsubps_loop_128:
> +; CHECK:   vfmaddsub231ps %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq
> +define <4 x float> @fmaddsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <4 x float> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fmsubaddps_loop_128:
> +; CHECK:   vfmsubadd231ps %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq
> +define <4 x float> @fmsubaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <4 x float> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fmaddps_loop_128:
> +; CHECK:   vfmadd231ps %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq
> +define <4 x float> @fmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <4 x float> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fmsubps_loop_128:
> +; CHECK:   vfmsub231ps %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq
> +define <4 x float> @fmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <4 x float> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fnmaddps_loop_128:
> +; CHECK:   vfnmadd231ps %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq
> +define <4 x float> @fnmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <4 x float> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fnmsubps_loop_128:
> +; CHECK:   vfnmsub231ps %xmm1, %xmm0, %xmm2
> +; CHECK:   vmovaps %xmm2, %xmm0
> +; CHECK-NEXT: retq
> +define <4 x float> @fnmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <4 x float> %c.addr.0
> +}
> +
> +declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
> +declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
> +declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
> +declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
> +declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
> +declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
> +
> +; CHECK-LABEL: fmaddsubpd_loop_256:
> +; CHECK:   vfmaddsub231pd %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
> entry:
>   br label %for.cond
> 
> @@ -24,9 +353,11 @@ for.end:
>   ret <4 x double> %c.addr.0
> }
> 
> -; CHECK-LABEL: fmsubaddpd_loop:
> -; CHECK:   vfmsubadd231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
> -define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
> +; CHECK-LABEL: fmsubaddpd_loop_256:
> +; CHECK:   vfmsubadd231pd %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
> entry:
>   br label %for.cond
> 
> @@ -48,9 +379,11 @@ for.end:
>   ret <4 x double> %c.addr.0
> }
> 
> -; CHECK-LABEL: fmaddpd_loop:
> -; CHECK:   vfmadd231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
> -define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
> +; CHECK-LABEL: fmaddpd_loop_256:
> +; CHECK:   vfmadd231pd %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
> entry:
>   br label %for.cond
> 
> @@ -72,9 +405,11 @@ for.end:
>   ret <4 x double> %c.addr.0
> }
> 
> -; CHECK-LABEL: fmsubpd_loop:
> -; CHECK:   vfmsub231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
> -define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
> +; CHECK-LABEL: fmsubpd_loop_256:
> +; CHECK:   vfmsub231pd %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
> entry:
>   br label %for.cond
> 
> @@ -96,15 +431,71 @@ for.end:
>   ret <4 x double> %c.addr.0
> }
> 
> +; CHECK-LABEL: fnmaddpd_loop_256:
> +; CHECK:   vfnmadd231pd %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <4 x double> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fnmsubpd_loop_256:
> +; CHECK:   vfnmsub231pd %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <4 x double> %c.addr.0
> +}
> +
> declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
> declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
> declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
> declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
> +declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
> +declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
> 
> 
> -; CHECK-LABEL: fmaddsubps_loop:
> -; CHECK:   vfmaddsub231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
> -define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
> +; CHECK-LABEL: fmaddsubps_loop_256:
> +; CHECK:   vfmaddsub231ps %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <8 x float> @fmaddsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
> entry:
>   br label %for.cond
> 
> @@ -126,9 +517,11 @@ for.end:
>   ret <8 x float> %c.addr.0
> }
> 
> -; CHECK-LABEL: fmsubaddps_loop:
> -; CHECK:   vfmsubadd231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
> -define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
> +; CHECK-LABEL: fmsubaddps_loop_256:
> +; CHECK:   vfmsubadd231ps %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <8 x float> @fmsubaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
> entry:
>   br label %for.cond
> 
> @@ -150,9 +543,11 @@ for.end:
>   ret <8 x float> %c.addr.0
> }
> 
> -; CHECK-LABEL: fmaddps_loop:
> -; CHECK:   vfmadd231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
> -define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
> +; CHECK-LABEL: fmaddps_loop_256:
> +; CHECK:   vfmadd231ps %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <8 x float> @fmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
> entry:
>   br label %for.cond
> 
> @@ -174,9 +569,11 @@ for.end:
>   ret <8 x float> %c.addr.0
> }
> 
> -; CHECK-LABEL: fmsubps_loop:
> -; CHECK:   vfmsub231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
> -define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
> +; CHECK-LABEL: fmsubps_loop_256:
> +; CHECK:   vfmsub231ps %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <8 x float> @fmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
> entry:
>   br label %for.cond
> 
> @@ -198,7 +595,61 @@ for.end:
>   ret <8 x float> %c.addr.0
> }
> 
> +; CHECK-LABEL: fnmaddps_loop_256:
> +; CHECK:   vfnmadd231ps %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <8 x float> @fnmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <8 x float> %c.addr.0
> +}
> +
> +; CHECK-LABEL: fnmsubps_loop_256:
> +; CHECK:   vfnmsub231ps %ymm1, %ymm0, %ymm2
> +; CHECK:   vmovaps %ymm2, %ymm0
> +; CHECK-NEXT: retq
> +define <8 x float> @fnmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
> +entry:
> +  br label %for.cond
> +
> +for.cond:
> +  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
> +  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
> +  %cmp = icmp slt i32 %i.0, %iter
> +  br i1 %cmp, label %for.body, label %for.end
> +
> +for.body:
> +  br label %for.inc
> +
> +for.inc:
> +  %0 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
> +  %inc = add nsw i32 %i.0, 1
> +  br label %for.cond
> +
> +for.end:
> +  ret <8 x float> %c.addr.0
> +}
> +
> declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
> declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
> declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
> declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
> +declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
> +declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
> 
> Modified: llvm/trunk/test/CodeGen/X86/fma-intrinsics-x86.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fma-intrinsics-x86.ll?rev=252060&r1=252059&r2=252060&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/fma-intrinsics-x86.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/fma-intrinsics-x86.ll Wed Nov  4 12:10:41 2015
> @@ -1,95 +1,149 @@
> ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
> ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
> +; RUN: llc < %s -mtriple=x86_64-pc-windows -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
> ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
> ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
> ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
> 
> ; VFMADD
> define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmadd_ss:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmadd213ss     (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ss:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
>   ret <4 x float> %res
> }
> +
> +define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> +; CHECK-LABEL: test_x86_fma_vfmadd_bac_ss:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0
> +;
> +; CHECK-FMA-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm1
> +; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
> +;
> +; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0
> +; CHECK-NEXT: retq
> +  %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
> +  ret <4 x float> %res
> +}
> declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
> 
> define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmadd_sd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_sd:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
>   ret <2 x double> %res
> }
> +
> +define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> +; CHECK-LABEL: test_x86_fma_vfmadd_bac_sd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0
> +;
> +; CHECK-FMA-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm1
> +; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
> +;
> +; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0
> +;
> +; CHECK-NEXT: retq
> +  %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
> +  ret <2 x double> %res
> +}
> declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
> 
> define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmadd_ps:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
>   ret <4 x float> %res
> }
> declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
> 
> define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmadd_pd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
>   ret <2 x double> %res
> }
> declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
> 
> define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmadd_ps_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
>   ret <8 x float> %res
> }
> declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
> 
> define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmadd_pd_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
>   ret <4 x double> %res
> }
> @@ -97,90 +151,144 @@ declare <4 x double> @llvm.x86.fma.vfmad
> 
> ; VFMSUB
> define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmsub_ss:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ss:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
>   ret <4 x float> %res
> }
> +
> +define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> +; CHECK-LABEL: test_x86_fma_vfmsub_bac_ss:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0
> +;
> +; CHECK-FMA-NEXT:    vfmsub213ss %xmm2, %xmm0, %xmm1
> +; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
> +;
> +; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0
> +;
> +; CHECK-NEXT: retq
> +  %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
> +  ret <4 x float> %res
> +}
> declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
> 
> define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmsub_sd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_sd:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
>   ret <2 x double> %res
> }
> +
> +define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> +; CHECK-LABEL: test_x86_fma_vfmsub_bac_sd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0
> +;
> +; CHECK-FMA-NEXT:    vfmsub213sd %xmm2, %xmm0, %xmm1
> +; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
> +;
> +; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0
> +;
> +; CHECK-NEXT: retq
> +  %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
> +  ret <2 x double> %res
> +}
> declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
> 
> define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmsub_ps:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
>   ret <4 x float> %res
> }
> declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
> 
> define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmsub_pd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
>   ret <2 x double> %res
> }
> declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
> 
> define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmsub_ps_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
>   ret <8 x float> %res
> }
> declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
> 
> define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmsub_pd_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
>   ret <4 x double> %res
> }
> @@ -188,90 +296,144 @@ declare <4 x double> @llvm.x86.fma.vfmsu
> 
> ; VFNMADD
> define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmadd_ss:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ss:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
>   ret <4 x float> %res
> }
> +
> +define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> +; CHECK-LABEL: test_x86_fma_vfnmadd_bac_ss:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0
> +;
> +; CHECK-FMA-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm1
> +; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
> +;
> +; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0
> +;
> +; CHECK-NEXT: retq
> +  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
> +  ret <4 x float> %res
> +}
> declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
> 
> define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmadd_sd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_sd:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
>   ret <2 x double> %res
> }
> +
> +define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> +; CHECK-LABEL: test_x86_fma_vfnmadd_bac_sd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0
> +;
> +; CHECK-FMA-NEXT:    vfnmadd213sd %xmm2, %xmm0, %xmm1
> +; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
> +;
> +; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0
> +;
> +; CHECK-NEXT: retq
> +  %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
> +  ret <2 x double> %res
> +}
> declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
> 
> define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmadd_ps:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
>   ret <4 x float> %res
> }
> declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
> 
> define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmadd_pd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
>   ret <2 x double> %res
> }
> declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
> 
> define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmadd_ps_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
>   ret <8 x float> %res
> }
> declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
> 
> define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmadd_pd_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
>   ret <4 x double> %res
> }
> @@ -279,90 +441,144 @@ declare <4 x double> @llvm.x86.fma.vfnma
> 
> ; VFNMSUB
> define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmsub_ss:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ss:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
>   ret <4 x float> %res
> }
> +
> +define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> +; CHECK-LABEL: test_x86_fma_vfnmsub_bac_ss:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0
> +;
> +; CHECK-FMA-NEXT:    vfnmsub213ss %xmm2, %xmm0, %xmm1
> +; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
> +;
> +; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0
> +;
> +; CHECK-NEXT: retq
> +  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
> +  ret <4 x float> %res
> +}
> declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
> 
> define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmsub_sd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_sd:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
>   ret <2 x double> %res
> }
> +
> +define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> +; CHECK-LABEL: test_x86_fma_vfnmsub_bac_sd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0
> +;
> +; CHECK-FMA-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1
> +; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
> +;
> +; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0
> +;
> +; CHECK-NEXT: retq
> +  %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
> +  ret <2 x double> %res
> +}
> declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
> 
> define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmsub_ps:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
>   ret <4 x float> %res
> }
> declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
> 
> define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmsub_pd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
>   ret <2 x double> %res
> }
> declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
> 
> define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmsub_ps_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
>   ret <8 x float> %res
> }
> declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
> 
> define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfnmsub_pd_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
>   ret <4 x double> %res
> }
> @@ -370,60 +586,72 @@ declare <4 x double> @llvm.x86.fma.vfnms
> 
> ; VFMADDSUB
> define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmaddsub_ps:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
>   ret <4 x float> %res
> }
> declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
> 
> define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmaddsub_pd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
>   ret <2 x double> %res
> }
> declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
> 
> define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmaddsub_ps_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
>   ret <8 x float> %res
> }
> declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
> 
> define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmaddsub_pd_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
>   ret <4 x double> %res
> }
> @@ -431,60 +659,72 @@ declare <4 x double> @llvm.x86.fma.vfmad
> 
> ; VFMSUBADD
> define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmsubadd_ps:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
>   ret <4 x float> %res
> }
> declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
> 
> define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmsubadd_pd:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0
> +;
> ; CHECK-FMA-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
>   ret <2 x double> %res
> }
> declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
> 
> define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmsubadd_ps_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
>   ret <8 x float> %res
> }
> declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
> 
> define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
> -; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256:
> -; CHECK-FMA:       # BB#0:
> +; CHECK-LABEL: test_x86_fma_vfmsubadd_pd_256:
> +; CHECK-NEXT:  # BB#0:
> +;
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
> +; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0
> +;
> ; CHECK-FMA-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0
> -; CHECK-FMA-NEXT:    retq
> ;
> -; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd_256:
> -; CHECK-FMA4:       # BB#0:
> -; CHECK-FMA4-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
> -; CHECK-FMA4-NEXT:    retq
> +; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
> +;
> +; CHECK-NEXT: retq
>   %res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
>   ret <4 x double> %res
> }
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits



More information about the llvm-commits mailing list