[llvm-commits] [llvm] r157737 - in /llvm/trunk: lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp lib/Target/X86/X86CodeEmitter.cpp lib/Target/X86/X86InstrFMA.td lib/Target/X86/X86InstrInfo.cpp lib/Target/X86/X86InstrInfo.h lib/Target/X86/X86Subta

Craig Topper craig.topper at gmail.com
Thu May 31 07:19:31 PDT 2012


Wouldn't it have been better to enable the intrinsics, but not allow
converting fadd(fmul) to FMA? That way people can still use FMA if they
explicitly request it through intrinsics.

On Thu, May 31, 2012 at 2:20 AM, Elena Demikhovsky <
elena.demikhovsky at intel.com> wrote:

> Author: delena
> Date: Thu May 31 04:20:20 2012
> New Revision: 157737
>
> URL: http://llvm.org/viewvc/llvm-project?rev=157737&view=rev
> Log:
> Added FMA3 Intel instructions.
> I disabled FMA3 autodetection, since the result may differ from expected
> for some benchmarks.
> I added tests for CodeGen and intrinsics.
> I did not change llvm.fma.f32/64 - it may be done later.
>
>
> Added:
>    llvm/trunk/test/CodeGen/X86/fma3-intrinsics.ll   (with props)
>    llvm/trunk/test/CodeGen/X86/fma3.ll   (with props)
> Modified:
>    llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
>    llvm/trunk/lib/Target/X86/X86CodeEmitter.cpp
>    llvm/trunk/lib/Target/X86/X86InstrFMA.td
>    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
>    llvm/trunk/lib/Target/X86/X86InstrInfo.h
>    llvm/trunk/lib/Target/X86/X86Subtarget.cpp
>
> Modified: llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp?rev=157737&r1=157736&r2=157737&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp (original)
> +++ llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp Thu May 31
> 04:20:20 2012
> @@ -570,7 +570,11 @@
>   }
>
>   // Classify VEX_B, VEX_4V, VEX_R, VEX_X
> +  unsigned NumOps = Desc.getNumOperands();
>   unsigned CurOp = 0;
> +  if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1)
> +    ++CurOp;
> +
>   switch (TSFlags & X86II::FormMask) {
>   case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
>   case X86II::MRMDestMem: {
> @@ -603,11 +607,11 @@
>     //  FMA4:
>     //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
>     //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
> -    if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
> +    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg()))
>       VEX_R = 0x0;
>
>     if (HasVEX_4V)
> -      VEX_4V = getVEXRegisterEncoding(MI, 1);
> +      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
>
>     if (X86II::isX86_64ExtendedReg(
>                MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
>
> Modified: llvm/trunk/lib/Target/X86/X86CodeEmitter.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CodeEmitter.cpp?rev=157737&r1=157736&r2=157737&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86CodeEmitter.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86CodeEmitter.cpp Thu May 31 04:20:20 2012
> @@ -933,7 +933,10 @@
>   }
>
>   // Classify VEX_B, VEX_4V, VEX_R, VEX_X
> +  unsigned NumOps = Desc->getNumOperands();
>   unsigned CurOp = 0;
> +  if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) != -1)
> +    ++CurOp;
>   switch (TSFlags & X86II::FormMask) {
>     case X86II::MRMInitReg:
>       // Duplicate register.
>
> Modified: llvm/trunk/lib/Target/X86/X86InstrFMA.td
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFMA.td?rev=157737&r1=157736&r2=157737&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrFMA.td (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrFMA.td Thu May 31 04:20:20 2012
> @@ -15,83 +15,358 @@
>  // FMA3 - Intel 3 operand Fused Multiply-Add instructions
>
>  //===----------------------------------------------------------------------===//
>
> +let Constraints = "$src1 = $dst" in {
>  multiclass fma3p_rm<bits<8> opc, string OpcodeStr> {
>   def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
> -           (ins VR128:$src1, VR128:$src2),
> -           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1,
> $src2}"),
> +           (ins VR128:$src1, VR128:$src2, VR128:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
>            []>;
> +  let mayLoad = 1 in
>   def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
> -           (ins VR128:$src1, f128mem:$src2),
> -           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1,
> $src2}"),
> +           (ins VR128:$src1, VR128:$src2, f128mem:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
>            []>;
>   def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
> -           (ins VR256:$src1, VR256:$src2),
> -           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1,
> $src2}"),
> +           (ins VR256:$src1, VR256:$src2, VR256:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
>            []>;
> +  let mayLoad = 1 in
>   def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
> -           (ins VR256:$src1, f256mem:$src2),
> -           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1,
> $src2}"),
> +           (ins VR256:$src1, VR256:$src2, f256mem:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
>            []>;
>  }
>
> +// Intrinsic for 132 pattern
> +multiclass fma3p_rm_int<bits<8> opc, string OpcodeStr,
> +                        PatFrag MemFrag128, PatFrag MemFrag256,
> +                        Intrinsic Int128, Intrinsic Int256> {
> +  def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
> +           (ins VR128:$src1, VR128:$src2, VR128:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
> +           [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src3,
> VR128:$src2))]>;
> +  //let mayLoad = 1 in
> +  def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
> +           (ins VR128:$src1, VR128:$src2, f128mem:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
> +           [(set VR128:$dst, (Int128 VR128:$src1, (MemFrag128
> addr:$src3), VR128:$src2))]>;
> +  def rY_Int : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
> +           (ins VR256:$src1, VR256:$src2, VR256:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
> +           [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src3,
> VR256:$src2))]>;
> +  //let mayLoad = 1 in
> +  def mY_Int : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
> +           (ins VR256:$src1, VR256:$src2, f256mem:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
> +           [(set VR256:$dst, (Int256 VR256:$src1, (MemFrag256
> addr:$src3), VR256:$src2))]>;
> +}
> +}
> +
>  multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
> -                       string OpcodeStr, string PackTy> {
> -  defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, !strconcat("132",
> PackTy))>;
> -  defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, !strconcat("213",
> PackTy))>;
> -  defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, !strconcat("231",
> PackTy))>;
> +                       string OpcodeStr, string PackTy,
> +                       PatFrag MemFrag128, PatFrag MemFrag256,
> +                       Intrinsic Int128, Intrinsic Int256> {
> +  defm r132 : fma3p_rm_int <opc132, !strconcat(OpcodeStr,
> !strconcat("132", PackTy)),
> +                              MemFrag128, MemFrag256, Int128, Int256>;
> +  defm r132 : fma3p_rm     <opc132, !strconcat(OpcodeStr,
> !strconcat("132", PackTy))>;
> +  defm r213 : fma3p_rm     <opc213, !strconcat(OpcodeStr,
> !strconcat("213", PackTy))>;
> +  defm r231 : fma3p_rm     <opc231, !strconcat(OpcodeStr,
> !strconcat("231", PackTy))>;
>  }
>
>  // Fused Multiply-Add
>  let ExeDomain = SSEPackedSingle in {
> -  defm VFMADDPS    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">;
> -  defm VFMSUBPS    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">;
> -  defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">;
> -  defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">;
> +  defm VFMADDPS    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps",
> memopv4f32, memopv8f32,
> +    int_x86_fma4_vfmadd_ps, int_x86_fma4_vfmadd_ps_256>;
> +  defm VFMSUBPS    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps",
>  memopv4f32, memopv8f32,
> +    int_x86_fma4_vfmsub_ps, int_x86_fma4_vfmsub_ps_256>;
> +  defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
> memopv4f32, memopv8f32,
> +    int_x86_fma4_vfmaddsub_ps, int_x86_fma4_vfmaddsub_ps_256>;
> +  defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
> memopv4f32, memopv8f32,
> +    int_x86_fma4_vfmsubadd_ps, int_x86_fma4_vfmaddsub_ps_256>;
>  }
>
>  let ExeDomain = SSEPackedDouble in {
> -  defm VFMADDPD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W;
> -  defm VFMSUBPD    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W;
> -  defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">,
> VEX_W;
> -  defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">,
> VEX_W;
> +  defm VFMADDPD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd",
> memopv2f64, memopv4f64,
> +    int_x86_fma4_vfmadd_pd, int_x86_fma4_vfmadd_pd_256>, VEX_W;
> +  defm VFMSUBPD    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd",
> memopv2f64, memopv4f64,
> +    int_x86_fma4_vfmsub_pd, int_x86_fma4_vfmsub_pd_256>, VEX_W;
> +  defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
> memopv2f64, memopv4f64,
> +    int_x86_fma4_vfmaddsub_pd, int_x86_fma4_vfmaddsub_pd_256>, VEX_W;
> +  defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
> memopv2f64, memopv4f64,
> +    int_x86_fma4_vfmsubadd_pd, int_x86_fma4_vfmsubadd_pd_256>, VEX_W;
>  }
>
>  // Fused Negative Multiply-Add
>  let ExeDomain = SSEPackedSingle in {
> -  defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">;
> -  defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">;
> +  defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps",
>  memopv4f32, memopv8f32,
> +    int_x86_fma4_vfnmadd_ps, int_x86_fma4_vfnmadd_ps_256>;
> +  defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps",
>  memopv4f32, memopv8f32,
> +    int_x86_fma4_vfnmsub_ps, int_x86_fma4_vfnmsub_ps_256>;
>  }
>  let ExeDomain = SSEPackedDouble in {
> -  defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W;
> -  defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W;
> +  defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd",
> memopv2f64, memopv4f64,
> +    int_x86_fma4_vfnmadd_pd, int_x86_fma4_vfnmadd_pd_256>, VEX_W;
> +  defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
> memopv2f64, memopv4f64,
> +    int_x86_fma4_vfnmsub_pd, int_x86_fma4_vfnmsub_pd_256>, VEX_W;
>  }
>
> -multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand
> x86memop> {
> -  def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
> -           (ins VR128:$src1, VR128:$src2),
> -           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1,
> $src2}"),
> +let Predicates = [HasFMA3], AddedComplexity = 20 in {
> +//------------
> +// FP double precision ADD - 256
> +//------------
> +
> +// FMA231: src1 = src2*src3 + src1
> +def : Pat<(v4f64 (fadd (fmul VR256:$src2, (memopv4f64 addr:$src3)),
> VR256:$src1)),
> +        (VFMADDPDr231mY VR256:$src1, VR256:$src2, addr:$src3)>;
> +
> +// FMA231: src1 = src2*src3 + src1
> +def : Pat<(v4f64 (fadd (fmul VR256:$src2, VR256:$src3), VR256:$src1)),
> +        (VFMADDPDr231rY VR256:$src1, VR256:$src2, VR256:$src3)>;
> +
> +
> +//------------
> +// FP double precision ADD - 128
> +//------------
> +
> +
> +// FMA231: src1 = src2*src3 + src1
> +def : Pat<(v2f64 (fadd (fmul VR128:$src2, (memopv2f64 addr:$src3)),
> VR128:$src1)),
> +            (VFMADDPDr231m VR128:$src1, VR128:$src2, addr:$src3)>;
> +
> +// FMA231: src1 = src2*src3 + src1
> +def : Pat<(v2f64 (fadd (fmul VR128:$src2, VR128:$src3), VR128:$src1)),
> +            (VFMADDPDr231r VR128:$src1, VR128:$src2, VR128:$src3)>;
> +
> +//------------
> +// FP double precision SUB - 256
> +//------------
> +// FMA231: src1 = src2*src3 - src1
> +def : Pat<(v4f64 (fsub (fmul VR256:$src2, (memopv4f64 addr:$src3)),
> VR256:$src1)),
> +           (VFMSUBPDr231mY VR256:$src1, VR256:$src2, addr:$src3)>;
> +
> +// FMA231: src1 = src2*src3 - src1
> +def : Pat<(v4f64 (fsub (fmul VR256:$src2, VR256:$src3), VR256:$src1)),
> +            (VFMSUBPDr231rY VR256:$src1, VR256:$src2, VR256:$src3)>;
> +
> +
> +//------------
> +// FP double precision SUB - 128
> +//------------
> +
> +// FMA231: src1 = src2*src3 - src1
> +def : Pat<(v2f64 (fsub (fmul VR128:$src2, (memopv2f64 addr:$src3)),
> VR128:$src1)),
> +            (VFMSUBPDr231m VR128:$src1, VR128:$src2, addr:$src3)>;
> +
> +// FMA231: src1 = src2*src3 - src1
> +def : Pat<(v2f64 (fsub (fmul VR128:$src2, VR128:$src3), VR128:$src1)),
> +            (VFMSUBPDr231r VR128:$src1, VR128:$src2, VR128:$src3)>;
> +
> +//------------
> +// FP double precision FNMADD - 256
> +//------------
> +// FMA231: src1 = - src2*src3 + src1
> +def : Pat<(v4f64 (fsub VR256:$src1, (fmul VR256:$src2, (memopv4f64
> addr:$src3)))),
> +            (VFNMADDPDr231mY VR256:$src1, VR256:$src2, addr:$src3)>;
> +
> +// FMA231: src1 = - src2*src3 + src1
> +def : Pat<(v4f64 (fsub VR256:$src1, (fmul VR256:$src2, VR256:$src3))),
> +            (VFNMADDPDr231rY VR256:$src1, VR256:$src2, VR256:$src3)>;
> +
> +//------------
> +// FP double precision FNMADD - 128
> +//------------
> +
> +// FMA231: src1 = - src2*src3 + src1
> +def : Pat<(v2f64 (fsub VR128:$src1, (fmul VR128:$src2, (memopv2f64
> addr:$src3)))),
> +            (VFNMADDPDr231m VR128:$src1, VR128:$src2, addr:$src3)>;
> +
> +// FMA231: src1 = - src2*src3 + src1
> +def : Pat<(v2f64 (fsub VR128:$src1, (fmul VR128:$src2, VR128:$src3))),
> +            (VFNMADDPDr231r VR128:$src1, VR128:$src2, VR128:$src3)>;
> +
> +//------------
> +// FP single precision ADD - 256
> +//------------
> +
> +// FMA231: src1 = src2*src3 + src1
> +def : Pat<(v8f32 (fadd (fmul VR256:$src2, VR256:$src3), VR256:$src1)),
> +            (VFMADDPSr231rY VR256:$src1, VR256:$src2, VR256:$src3)>;
> +
> +// FMA213 : src1 = src2*src1 + src3
> +def : Pat<(v8f32 (fadd (fmul VR256:$src1, VR256:$src2), (memopv8f32
> addr:$src3))),
> +            (VFMADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
> +
> +// FMA231: src1 = src2*src3 + src1
> +def : Pat<(v8f32 (fadd (fmul (memopv8f32 addr:$src3), VR256:$src2),
> VR256:$src1)),
> +            (VFMADDPSr231mY VR256:$src1, VR256:$src2, addr:$src3)>;
> +
> +// FMA213: src1 = src2*src1 + src3
> +def : Pat<(v8f32 (fadd (fmul VR256:$src2, VR256:$src1), VR256:$src3)),
> +            (VFMADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
> +
> +//------------
> +// FP single precision ADD - 128
> +//------------
> +
> +// FMA231 : src1 = src2*src3 + src1
> +def : Pat<(v4f32 (fadd (fmul VR128:$src2, (memopv4f32 addr:$src3)),
> VR128:$src1)),
> +            (VFMADDPSr231m VR128:$src1, VR128:$src2, addr:$src3)>;
> +
> +// FMA231 : src1 = src2*src3 + src1
> +def : Pat<(v4f32 (fadd (fmul VR128:$src2, VR128:$src3), VR128:$src1)),
> +        (VFMADDPSr231r VR128:$src1, VR128:$src2, VR128:$src3)>;
> +
> +//------------
> +// FP single precision SUB - 256
> +//------------
> +// FMA231: src1 = src2*src3 - src1
> +def : Pat<(v8f32 (fsub (fmul VR256:$src2, (memopv8f32 addr:$src3)),
> VR256:$src1)),
> +            (VFMSUBPSr231mY VR256:$src1, VR256:$src2, addr:$src3)>;
> +
> +// FMA231: src1 = src2*src3 - src1
> +def : Pat<(v8f32 (fsub (fmul VR256:$src2, VR256:$src3), VR256:$src1)),
> +            (VFMSUBPSr231rY VR256:$src1, VR256:$src2, VR256:$src3)>;
> +
> +//------------
> +// FP single precision SUB - 128
> +//------------
> +// FMA231 : src1 = src2*src3 - src1
> +def : Pat<(v4f32 (fsub (fmul VR128:$src2, (memopv4f32 addr:$src3)),
> VR128:$src1)),
> +            (VFMSUBPSr231m VR128:$src1, VR128:$src2, addr:$src3)>;
> +
> +// FMA231 : src1 = src2*src3 - src1
> +def : Pat<(v4f32 (fsub (fmul VR128:$src2, VR128:$src3), VR128:$src1)),
> +            (VFMSUBPSr231r VR128:$src1, VR128:$src2, VR128:$src3)>;
> +
> +//------------
> +// FP single precision FNMADD - 256
> +//------------
> +// FMA231: src1 = - src2*src3 + src1
> +def : Pat<(v8f32 (fsub VR256:$src1, (fmul VR256:$src2, (memopv8f32
> addr:$src3)))),
> +            (VFNMADDPSr231mY VR256:$src1, VR256:$src2, addr:$src3)>;
> +
> +// FMA231: src1 = - src2*src3 + src1
> +def : Pat<(v8f32 (fsub VR256:$src1, (fmul VR256:$src2, VR256:$src3))),
> +            (VFNMADDPSr231rY VR256:$src1, VR256:$src2, VR256:$src3)>;
> +
> +//------------
> +// FP single precision FNMADD - 128
> +//------------
> +
> +// FMA231 : src1 = - src2*src3 + src1
> +def : Pat<(v4f32 (fsub VR128:$src1, (fmul VR128:$src2, (memopv4f32
> addr:$src3)))),
> +            (VFNMADDPSr231m VR128:$src1, VR128:$src2, addr:$src3)>;
> +
> +// FMA231 : src1 = - src2*src3 + src1
> +def : Pat<(v4f32 (fsub VR128:$src1, (fmul VR128:$src2, VR128:$src3))),
> +            (VFNMADDPSr231r VR128:$src1, VR128:$src2, VR128:$src3)>;
> +
> +} // HasFMA3
> +
> +//------------------------------
> +// SCALAR
> +//------------------------------
> +
> +let Constraints = "$src1 = $dst" in {
> +multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand
> x86memop, RegisterClass RC> {
> +  def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
> +           (ins RC:$src1, RC:$src2, RC:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
>            []>;
> -  def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
> -           (ins VR128:$src1, x86memop:$src2),
> -           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1,
> $src2}"),
> +  def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
> +           (ins RC:$src1, RC:$src2, x86memop:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
>            []>;
>  }
>
> +multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,X86MemOperand
> x86memop, RegisterClass RC,
> +           Intrinsic IntId> {
> +  def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
> +           (ins RC:$src1, RC:$src2, RC:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
> +           [(set RC:$dst, (IntId RC:$src1, RC:$src3, RC:$src2))]>;
> +  def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
> +           (ins RC:$src1, VR128:$src2, x86memop:$src3),
> +           !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2,
> $src3}"),
> +           [(set RC:$dst, (IntId RC:$src1, (load addr:$src3),
> RC:$src2))]>;
> +}
> +}
> +
>  multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
> -                       string OpcodeStr> {
> -  defm SSr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132ss"), f32mem>;
> -  defm SSr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213ss"), f32mem>;
> -  defm SSr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231ss"), f32mem>;
> -  defm SDr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132sd"), f64mem>,
> VEX_W;
> -  defm SDr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213sd"), f64mem>,
> VEX_W;
> -  defm SDr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231sd"), f64mem>,
> VEX_W;
> +                       string OpcodeStr, string PackTy, X86MemOperand
> MemOp,
> +                       RegisterClass RC, Intrinsic IntId> {
> +  defm r132    : fma3s_rm         <opc132, !strconcat(OpcodeStr,
> !strconcat("132", PackTy)), MemOp, RC>;
> +  defm r213    : fma3s_rm         <opc213, !strconcat(OpcodeStr,
> !strconcat("213", PackTy)), MemOp, RC>;
> +  defm r231    : fma3s_rm         <opc231, !strconcat(OpcodeStr,
> !strconcat("231", PackTy)), MemOp, RC>;
> +  defm r132_Int: fma3s_rm_int     <opc132, !strconcat(OpcodeStr,
> !strconcat("132", PackTy)), MemOp, VR128, IntId>;
>  }
>
> -defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd">, VEX_LIG;
> -defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub">, VEX_LIG;
> +defm VFMADDSS : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd", "ss", f32mem,
> FR32, int_x86_fma4_vfmadd_ss>, VEX_LIG;
> +defm VFMADDSD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd", "sd", f64mem,
> FR64, int_x86_fma4_vfmadd_sd>, VEX_W, VEX_LIG;
> +defm VFMSUBSS : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub", "ss", f32mem,
> FR32, int_x86_fma4_vfmsub_ss>, VEX_LIG;
> +defm VFMSUBSD : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub", "sd", f64mem,
> FR64, int_x86_fma4_vfmsub_sd>, VEX_W, VEX_LIG;
> +
> +defm VFNMADDSS : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd", "ss", f32mem,
> FR32, int_x86_fma4_vfnmadd_ss>, VEX_LIG;
> +defm VFNMADDSD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd", "sd", f64mem,
> FR64, int_x86_fma4_vfnmadd_sd>, VEX_W, VEX_LIG;
> +defm VFNMSUBSS : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", "ss", f32mem,
> FR32, int_x86_fma4_vfnmsub_ss>, VEX_LIG;
> +defm VFNMSUBSD : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", "sd", f64mem,
> FR64, int_x86_fma4_vfnmsub_sd>, VEX_W, VEX_LIG;
> +
> +
> +let Predicates = [HasFMA3], AddedComplexity = 20 in {
> +
> +//------------
> +// FP  scalar ADD
> +//------------
>
> -defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd">, VEX_LIG;
> -defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub">, VEX_LIG;
> +
> +// FMADD231 : src1 = src2*src3 + src1
> +def : Pat<(f32 (fadd (fmul FR32:$src2, FR32:$src3), FR32:$src1)),
> +            (VFMADDSSr231r FR32:$src1, FR32:$src2, FR32:$src3)>;
> +
> +def : Pat<(f32 (fadd (fmul FR32:$src2, (loadf32 addr:$src3)),
> FR32:$src1)),
> +            (VFMADDSSr231m FR32:$src1, FR32:$src2, addr:$src3)>;
> +
> +def : Pat<(f64 (fadd (fmul FR64:$src2, FR64:$src3), FR64:$src1)),
> +            (VFMADDSDr231r FR64:$src1, FR64:$src2, FR64:$src3)>;
> +
> +def : Pat<(f64 (fadd (fmul FR64:$src2, (loadf64 addr:$src3)),
> FR64:$src1)),
> +            (VFMADDSDr231m FR64:$src1, FR64:$src2, addr:$src3)>;
> +
> +
> +
> +//------------
> +// FP  scalar SUB src2*src3 - src1
> +//------------
> +
> +def : Pat<(f32 (fsub (fmul FR32:$src2, FR32:$src3), FR32:$src1)),
> +            (VFMSUBSSr231r FR32:$src1, FR32:$src2, FR32:$src3)>;
> +
> +def : Pat<(f32 (fsub (fmul FR32:$src2, (loadf32 addr:$src3)),
> FR32:$src1)),
> +            (VFMSUBSSr231m FR32:$src1, FR32:$src2, addr:$src3)>;
> +
> +def : Pat<(f64 (fsub (fmul FR64:$src2, FR64:$src3), FR64:$src1)),
> +            (VFMSUBSDr231r FR64:$src1, FR64:$src2, FR64:$src3)>;
> +
> +def : Pat<(f64 (fsub (fmul FR64:$src2, (loadf64 addr:$src3)),
> FR64:$src1)),
> +            (VFMSUBSDr231m FR64:$src1, FR64:$src2, addr:$src3)>;
> +
> +//------------
> +// FP  scalar NADD src1 - src2*src3
> +//------------
> +
> +def : Pat<(f32 (fsub FR32:$src1, (fmul FR32:$src2, FR32:$src3))),
> +            (VFNMADDSSr231r FR32:$src1, FR32:$src2, FR32:$src3)>;
> +
> +def : Pat<(f32 (fsub FR32:$src1, (fmul FR32:$src2, (loadf32
> addr:$src3)))),
> +            (VFNMADDSSr231m FR32:$src1, FR32:$src2, addr:$src3)>;
> +
> +def : Pat<(f64 (fsub FR64:$src1, (fmul FR64:$src2, FR64:$src3))),
> +            (VFNMADDSDr231r FR64:$src1, FR64:$src2, FR64:$src3)>;
> +
> +def : Pat<(f64 (fsub FR64:$src1, (fmul FR64:$src2, (loadf64
> addr:$src3)))),
> +            (VFNMADDSDr231m FR64:$src1, FR64:$src2, addr:$src3)>;
> +
> +} // HasFMA3
>
>
>  //===----------------------------------------------------------------------===//
>  // FMA4 - AMD 4 operand Fused Multiply-Add instructions
> @@ -178,6 +453,8 @@
>  } // isCodeGenOnly = 1
>  }
>
> +let Predicates = [HasFMA4] in {
> +
>  defm VFMADDSS4    : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32,
>                           int_x86_fma4_vfmadd_ss>;
>  defm VFMADDSD4    : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64,
> @@ -218,3 +495,5 @@
>                          int_x86_fma4_vfmsubadd_ps_256, memopv4f32,
> memopv8f32>;
>  defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma4_vfmsubadd_pd,
>                          int_x86_fma4_vfmsubadd_pd_256, memopv2f64,
> memopv4f64>;
> +} // HasFMA4
> +
>
> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=157737&r1=157736&r2=157737&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Thu May 31 04:20:20 2012
> @@ -58,6 +58,7 @@
>   TB_INDEX_0    = 0,
>   TB_INDEX_1    = 1,
>   TB_INDEX_2    = 2,
> +  TB_INDEX_3    = 3,
>   TB_INDEX_MASK = 0xff,
>
>   // Minimum alignment required for load/store.
> @@ -1122,6 +1123,75 @@
>                   // Index 2, folded load
>                   Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
>   }
> +
> +  static const X86OpTblEntry OpTbl3[] = {
> +    // FMA foldable instructions
> +    { X86::VFMADDSSr231r,     X86::VFMADDSSr231m,      0 },
> +    { X86::VFMADDSDr231r,     X86::VFMADDSDr231m,      0 },
> +    { X86::VFMADDSSr132r,     X86::VFMADDSSr132m,      0 },
> +    { X86::VFMADDSDr132r,     X86::VFMADDSDr132m,      0 },
> +
> +    { X86::VFMADDPSr231r,     X86::VFMADDPSr231m,      TB_ALIGN_16 },
> +    { X86::VFMADDPDr231r,     X86::VFMADDPDr231m,      TB_ALIGN_16 },
> +    { X86::VFMADDPSr132r,     X86::VFMADDPSr132m,      TB_ALIGN_16 },
> +    { X86::VFMADDPDr132r,     X86::VFMADDPDr132m,      TB_ALIGN_16 },
> +    { X86::VFMADDPSr213r,     X86::VFMADDPSr213m,      TB_ALIGN_16 },
> +    { X86::VFMADDPDr213r,     X86::VFMADDPDr213m,      TB_ALIGN_16 },
> +    { X86::VFMADDPSr231rY,    X86::VFMADDPSr231mY,     TB_ALIGN_32 },
> +    { X86::VFMADDPDr231rY,    X86::VFMADDPDr231mY,     TB_ALIGN_32 },
> +    { X86::VFMADDPSr132rY,    X86::VFMADDPSr132mY,     TB_ALIGN_32 },
> +    { X86::VFMADDPDr132rY,    X86::VFMADDPDr132mY,     TB_ALIGN_32 },
> +    { X86::VFMADDPSr213rY,    X86::VFMADDPSr213mY,     TB_ALIGN_32 },
> +    { X86::VFMADDPDr213rY,    X86::VFMADDPDr213mY,     TB_ALIGN_32 },
> +
> +    { X86::VFNMADDSSr231r,    X86::VFNMADDSSr231m,     0 },
> +    { X86::VFNMADDSDr231r,    X86::VFNMADDSDr231m,     0 },
> +    { X86::VFNMADDSSr132r,    X86::VFNMADDSSr132m,     0 },
> +    { X86::VFNMADDSDr132r,    X86::VFNMADDSDr132m,     0 },
> +
> +    { X86::VFNMADDPSr231r,    X86::VFNMADDPSr231m,     TB_ALIGN_16 },
> +    { X86::VFNMADDPDr231r,    X86::VFNMADDPDr231m,     TB_ALIGN_16 },
> +    { X86::VFNMADDPSr132r,    X86::VFNMADDPSr132m,     TB_ALIGN_16 },
> +    { X86::VFNMADDPDr132r,    X86::VFNMADDPDr132m,     TB_ALIGN_16 },
> +    { X86::VFNMADDPSr213r,    X86::VFNMADDPSr213m,     TB_ALIGN_16 },
> +    { X86::VFNMADDPDr213r,    X86::VFNMADDPDr213m,     TB_ALIGN_16 },
> +    { X86::VFNMADDPSr231rY,   X86::VFNMADDPSr231mY,    TB_ALIGN_32 },
> +    { X86::VFNMADDPDr231rY,   X86::VFNMADDPDr231mY,    TB_ALIGN_32 },
> +    { X86::VFNMADDPSr132rY,   X86::VFNMADDPSr132mY,    TB_ALIGN_32 },
> +    { X86::VFNMADDPDr132rY,   X86::VFNMADDPDr132mY,    TB_ALIGN_32 },
> +    { X86::VFNMADDPSr213rY,   X86::VFNMADDPSr213mY,    TB_ALIGN_32 },
> +    { X86::VFNMADDPDr213rY,   X86::VFNMADDPDr213mY,    TB_ALIGN_32 },
> +
> +    { X86::VFMSUBSSr231r,     X86::VFMSUBSSr231m,      0 },
> +    { X86::VFMSUBSDr231r,     X86::VFMSUBSDr231m,      0 },
> +    { X86::VFMSUBSSr132r,     X86::VFMSUBSSr132m,      0 },
> +    { X86::VFMSUBSDr132r,     X86::VFMSUBSDr132m,      0 },
> +
> +    { X86::VFMSUBPSr231r,     X86::VFMSUBPSr231m,      TB_ALIGN_16 },
> +    { X86::VFMSUBPDr231r,     X86::VFMSUBPDr231m,      TB_ALIGN_16 },
> +    { X86::VFMSUBPSr132r,     X86::VFMSUBPSr132m,      TB_ALIGN_16 },
> +    { X86::VFMSUBPDr132r,     X86::VFMSUBPDr132m,      TB_ALIGN_16 },
> +    { X86::VFMSUBPSr213r,     X86::VFMSUBPSr213m,      TB_ALIGN_16 },
> +    { X86::VFMSUBPDr213r,     X86::VFMSUBPDr213m,      TB_ALIGN_16 },
> +    { X86::VFMSUBPSr231rY,    X86::VFMSUBPSr231mY,     TB_ALIGN_32 },
> +    { X86::VFMSUBPDr231rY,    X86::VFMSUBPDr231mY,     TB_ALIGN_32 },
> +    { X86::VFMSUBPSr132rY,    X86::VFMSUBPSr132mY,     TB_ALIGN_32 },
> +    { X86::VFMSUBPDr132rY,    X86::VFMSUBPDr132mY,     TB_ALIGN_32 },
> +    { X86::VFMSUBPSr213rY,    X86::VFMSUBPSr213mY,     TB_ALIGN_32 },
> +    { X86::VFMSUBPDr213rY,    X86::VFMSUBPDr213mY,     TB_ALIGN_32 },
> +
> +  };
> +
> +  for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
> +    unsigned RegOp = OpTbl3[i].RegOp;
> +    unsigned MemOp = OpTbl3[i].MemOp;
> +    unsigned Flags = OpTbl3[i].Flags;
> +    AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
> +                  RegOp, MemOp,
> +                  // Index 3, folded load
> +                  Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
> +  }
> +
>  }
>
>  void
>
> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.h?rev=157737&r1=157736&r2=157737&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrInfo.h (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.h Thu May 31 04:20:20 2012
> @@ -128,7 +128,8 @@
>   X86TargetMachine &TM;
>   const X86RegisterInfo RI;
>
> -  /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
> +  /// RegOp2MemOpTable3, RegOp2MemOpTable2Addr,
> +  /// RegOp2MemOpTable0, RegOp2MemOpTable1,
>   /// RegOp2MemOpTable2 - Load / store folding opcode maps.
>   ///
>   typedef DenseMap<unsigned,
> @@ -137,6 +138,7 @@
>   RegOp2MemOpTableType RegOp2MemOpTable0;
>   RegOp2MemOpTableType RegOp2MemOpTable1;
>   RegOp2MemOpTableType RegOp2MemOpTable2;
> +  RegOp2MemOpTableType RegOp2MemOpTable3;
>
>   /// MemOp2RegOpTable - Load / store unfolding opcode map.
>   ///
>
> Modified: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.cpp?rev=157737&r1=157736&r2=157737&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86Subtarget.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp Thu May 31 04:20:20 2012
> @@ -205,10 +205,12 @@
>     HasCLMUL = true;
>     ToggleFeature(X86::FeatureCLMUL);
>   }
> -  if ((ECX >> 12) & 0x1) {
> -    HasFMA3 = true;
> -    ToggleFeature(X86::FeatureFMA3);
> -  }
> +  // FMA3 autodetection is switched off until we have a special flag
> +  // in code generator
> +  //if ((ECX >> 12) & 0x1) {
> +  //  HasFMA3 = true;
> +  //  ToggleFeature(X86::FeatureFMA3);
> +  //}
>   if (IsIntel && ((ECX >> 22) & 0x1)) {
>     HasMOVBE = true;
>     ToggleFeature(X86::FeatureMOVBE);
>
> Added: llvm/trunk/test/CodeGen/X86/fma3-intrinsics.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fma3-intrinsics.ll?rev=157737&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/fma3-intrinsics.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/fma3-intrinsics.ll Thu May 31 04:20:20 2012
> @@ -0,0 +1,132 @@
> +; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2
> -mattr=avx2,+fma3 | FileCheck %s
> +
> +define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fmadd132ss {{.*\(%r.*}}, %xmm
> +  %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a0, <4 x
> float> %a1, <4 x float> %a2) nounwind
> +  ret <4 x float> %res
> +}
> +declare <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float>, <4 x float>, <4
> x float>) nounwind readnone
> +
> +define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fmadd132ps
> +  %res = call <4 x float> @llvm.x86.fma4.vfmadd.ps(<4 x float> %a0, <4 x
> float> %a1, <4 x float> %a2) nounwind
> +  ret <4 x float> %res
> +}
> +declare <4 x float> @llvm.x86.fma4.vfmadd.ps(<4 x float>, <4 x float>,
> <4 x float>) nounwind readnone
> +
> +define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1,
> <8 x float> %a2) {
> +  ; CHECK: fmadd132ps {{.*\(%r.*}}, %ymm
> +  %res = call <8 x float> @llvm.x86.fma4.vfmadd.ps.256(<8 x float> %a0,
> <8 x float> %a1, <8 x float> %a2) nounwind
> +  ret <8 x float> %res
> +}
> +declare <8 x float> @llvm.x86.fma4.vfmadd.ps.256(<8 x float>, <8 x
> float>, <8 x float>) nounwind readnone
> +
> +define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fnmadd132ss {{.*\(%r.*}}, %xmm
> +  %res = call <4 x float> @llvm.x86.fma4.vfnmadd.ss(<4 x float> %a0, <4 x
> float> %a1, <4 x float> %a2) nounwind
> +  ret <4 x float> %res
> +}
> +declare <4 x float> @llvm.x86.fma4.vfnmadd.ss(<4 x float>, <4 x float>,
> <4 x float>) nounwind readnone
> +
> +define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fnmadd132ps
> +  %res = call <4 x float> @llvm.x86.fma4.vfnmadd.ps(<4 x float> %a0, <4
> x float> %a1, <4 x float> %a2) nounwind
> +  ret <4 x float> %res
> +}
> +declare <4 x float> @llvm.x86.fma4.vfnmadd.ps(<4 x float>, <4 x float>,
> <4 x float>) nounwind readnone
> +
> +define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float>
> %a1, <8 x float> %a2) {
> +  ; CHECK: fnmadd132ps {{.*\(%r.*}}, %ymm
> +  %res = call <8 x float> @llvm.x86.fma4.vfnmadd.ps.256(<8 x float> %a0,
> <8 x float> %a1, <8 x float> %a2) nounwind
> +  ret <8 x float> %res
> +}
> +declare <8 x float> @llvm.x86.fma4.vfnmadd.ps.256(<8 x float>, <8 x
> float>, <8 x float>) nounwind readnone
> +
> +
> +define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fmsub132ss
> +  %res = call <4 x float> @llvm.x86.fma4.vfmsub.ss(<4 x float> %a0, <4 x
> float> %a1, <4 x float> %a2) nounwind
> +  ret <4 x float> %res
> +}
> +declare <4 x float> @llvm.x86.fma4.vfmsub.ss(<4 x float>, <4 x float>, <4
> x float>) nounwind readnone
> +
> +define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fmsub132ps
> +  %res = call <4 x float> @llvm.x86.fma4.vfmsub.ps(<4 x float> %a0, <4 x
> float> %a1, <4 x float> %a2) nounwind
> +  ret <4 x float> %res
> +}
> +declare <4 x float> @llvm.x86.fma4.vfmsub.ps(<4 x float>, <4 x float>,
> <4 x float>) nounwind readnone
> +
> +define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fnmsub132ss
> +  %res = call <4 x float> @llvm.x86.fma4.vfnmsub.ss(<4 x float> %a0, <4 x
> float> %a1, <4 x float> %a2) nounwind
> +  ret <4 x float> %res
> +}
> +declare <4 x float> @llvm.x86.fma4.vfnmsub.ss(<4 x float>, <4 x float>,
> <4 x float>) nounwind readnone
> +
> +define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fnmsub132ps
> +  %res = call <4 x float> @llvm.x86.fma4.vfnmsub.ps(<4 x float> %a0, <4
> x float> %a1, <4 x float> %a2) nounwind
> +  ret <4 x float> %res
> +}
> +declare <4 x float> @llvm.x86.fma4.vfnmsub.ps(<4 x float>, <4 x float>,
> <4 x float>) nounwind readnone
> +
> +;;;;
> +
> +define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double>
> %a1, <2 x double> %a2) {
> +  ; CHECK: fmadd132sd
> +  %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a0, <2
> x double> %a1, <2 x double> %a2) nounwind
> +  ret <2 x double> %res
> +}
> +declare <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double>, <2 x
> double>, <2 x double>) nounwind readnone
> +
> +define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double>
> %a1, <2 x double> %a2) {
> +  ; CHECK: fmadd132pd
> +  %res = call <2 x double> @llvm.x86.fma4.vfmadd.pd(<2 x double> %a0, <2
> x double> %a1, <2 x double> %a2) nounwind
> +  ret <2 x double> %res
> +}
> +declare <2 x double> @llvm.x86.fma4.vfmadd.pd(<2 x double>, <2 x double>,
> <2 x double>) nounwind readnone
> +
> +define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double>
> %a1, <2 x double> %a2) {
> +  ; CHECK: fnmadd132sd
> +  %res = call <2 x double> @llvm.x86.fma4.vfnmadd.sd(<2 x double> %a0,
> <2 x double> %a1, <2 x double> %a2) nounwind
> +  ret <2 x double> %res
> +}
> +declare <2 x double> @llvm.x86.fma4.vfnmadd.sd(<2 x double>, <2 x
> double>, <2 x double>) nounwind readnone
> +
> +define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double>
> %a1, <2 x double> %a2) {
> +  ; CHECK: fnmadd132pd
> +  %res = call <2 x double> @llvm.x86.fma4.vfnmadd.pd(<2 x double> %a0, <2
> x double> %a1, <2 x double> %a2) nounwind
> +  ret <2 x double> %res
> +}
> +declare <2 x double> @llvm.x86.fma4.vfnmadd.pd(<2 x double>, <2 x
> double>, <2 x double>) nounwind readnone
> +
> +
> +
> +define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double>
> %a1, <2 x double> %a2) {
> +  ; CHECK: fmsub132sd
> +  %res = call <2 x double> @llvm.x86.fma4.vfmsub.sd(<2 x double> %a0, <2
> x double> %a1, <2 x double> %a2) nounwind
> +  ret <2 x double> %res
> +}
> +declare <2 x double> @llvm.x86.fma4.vfmsub.sd(<2 x double>, <2 x
> double>, <2 x double>) nounwind readnone
> +
> +define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double>
> %a1, <2 x double> %a2) {
> +  ; CHECK: fmsub132pd
> +  %res = call <2 x double> @llvm.x86.fma4.vfmsub.pd(<2 x double> %a0, <2
> x double> %a1, <2 x double> %a2) nounwind
> +  ret <2 x double> %res
> +}
> +declare <2 x double> @llvm.x86.fma4.vfmsub.pd(<2 x double>, <2 x double>,
> <2 x double>) nounwind readnone
> +
> +define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double>
> %a1, <2 x double> %a2) {
> +  ; CHECK: fnmsub132sd
> +  %res = call <2 x double> @llvm.x86.fma4.vfnmsub.sd(<2 x double> %a0,
> <2 x double> %a1, <2 x double> %a2) nounwind
> +  ret <2 x double> %res
> +}
> +declare <2 x double> @llvm.x86.fma4.vfnmsub.sd(<2 x double>, <2 x
> double>, <2 x double>) nounwind readnone
> +
> +define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double>
> %a1, <2 x double> %a2) {
> +  ; CHECK: fnmsub132pd
> +  %res = call <2 x double> @llvm.x86.fma4.vfnmsub.pd(<2 x double> %a0, <2
> x double> %a1, <2 x double> %a2) nounwind
> +  ret <2 x double> %res
> +}
> +declare <2 x double> @llvm.x86.fma4.vfnmsub.pd(<2 x double>, <2 x
> double>, <2 x double>) nounwind readnone
>
> Propchange: llvm/trunk/test/CodeGen/X86/fma3-intrinsics.ll
>
> ------------------------------------------------------------------------------
>    svn:executable = *
>
> Added: llvm/trunk/test/CodeGen/X86/fma3.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fma3.ll?rev=157737&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/fma3.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/fma3.ll Thu May 31 04:20:20 2012
> @@ -0,0 +1,66 @@
> +; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma3 | FileCheck %s
> +
> +define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fmadd231ps {{.*\(%r.*}}, %xmm
> +  %x = fmul <4 x float> %a0, %a1
> +  %res = fadd <4 x float> %x, %a2
> +  ret <4 x float> %res
> +}
> +
> +define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fmsub231ps {{.*\(%r.*}}, %xmm
> +  %x = fmul <4 x float> %a0, %a1
> +  %res = fsub <4 x float> %x, %a2
> +  ret <4 x float> %res
> +}
> +
> +define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1,
> <4 x float> %a2) {
> +  ; CHECK: fnmadd231ps {{.*\(%r.*}}, %xmm
> +  %x = fmul <4 x float> %a0, %a1
> +  %res = fsub <4 x float> %a2, %x
> +  ret <4 x float> %res
> +}
> +
> +define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1,
> <8 x float> %a2) {
> +  ; CHECK: vfmadd213ps {{.*\(%r.*}}, %ymm
> +  %x = fmul <8 x float> %a0, %a1
> +  %res = fadd <8 x float> %x, %a2
> +  ret <8 x float> %res
> +}
> +
> +define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double>
> %a1, <4 x double> %a2) {
> +  ; CHECK: vfmadd231pd {{.*\(%r.*}}, %ymm
> +  %x = fmul <4 x double> %a0, %a1
> +  %res = fadd <4 x double> %x, %a2
> +  ret <4 x double> %res
> +}
> +
> +
> +define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1,
> <8 x float> %a2) {
> +  ; CHECK: fmsub231ps {{.*\(%r.*}}, %ymm
> +  %x = fmul <8 x float> %a0, %a1
> +  %res = fsub <8 x float> %x, %a2
> +  ret <8 x float> %res
> +}
> +
> +define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float>
> %a1, <8 x float> %a2) {
> +  ; CHECK: fnmadd231ps {{.*\(%r.*}}, %ymm
> +  %x = fmul <8 x float> %a0, %a1
> +  %res = fsub <8 x float> %a2, %x
> +  ret <8 x float> %res
> +}
> +
> +define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
> +  ; CHECK: vfnmadd231ss    %xmm1, %xmm0, %xmm2
> +  %x = fmul float %a0, %a1
> +  %res = fsub float %a2, %x
> +  ret float %res
> +}
> +
> +define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
> +  ; CHECK: vfnmadd231sd    %xmm1, %xmm0, %xmm2
> +  %x = fmul double %a0, %a1
> +  %res = fsub double %a2, %x
> +  ret double %res
> +}
> +
>
> Propchange: llvm/trunk/test/CodeGen/X86/fma3.ll
>
> ------------------------------------------------------------------------------
>    svn:executable = *
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>



-- 
~Craig
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20120531/80bea1ea/attachment.html>


More information about the llvm-commits mailing list