[llvm] r278431 - X86-FMA3: Implemented commute transformation for EVEX/AVX512 FMA3 opcodes.

Vyacheslav Klochkov via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 11 15:07:34 PDT 2016


Author: v_klochkov
Date: Thu Aug 11 17:07:33 2016
New Revision: 278431

URL: http://llvm.org/viewvc/llvm-project?rev=278431&view=rev
Log:
X86-FMA3: Implemented commute transformation for EVEX/AVX512 FMA3 opcodes.
This helped to improve memory-folding and register coalescing optimizations.

Also, this patch fixed the tracker issue #17229.

Reviewer: Craig Topper.
Differential Revision: https://reviews.llvm.org/D23108

Added:
    llvm/trunk/lib/Target/X86/X86InstrFMA3Info.cpp   (with props)
    llvm/trunk/lib/Target/X86/X86InstrFMA3Info.h   (with props)
Modified:
    llvm/trunk/lib/Target/X86/CMakeLists.txt
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
    llvm/trunk/lib/Target/X86/X86InstrInfo.h
    llvm/trunk/test/CodeGen/X86/avx512-fma-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512-fma.ll
    llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/fma-fneg-combine.ll

Modified: llvm/trunk/lib/Target/X86/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/CMakeLists.txt?rev=278431&r1=278430&r2=278431&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/X86/CMakeLists.txt Thu Aug 11 17:07:33 2016
@@ -24,6 +24,7 @@ set(sources
   X86FrameLowering.cpp
   X86ISelDAGToDAG.cpp
   X86ISelLowering.cpp
+  X86InstrFMA3Info.cpp
   X86InstrInfo.cpp
   X86MCInstLower.cpp
   X86MachineFunctionInfo.cpp

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=278431&r1=278430&r2=278431&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Thu Aug 11 17:07:33 2016
@@ -194,7 +194,8 @@ multiclass AVX512_maskable_custom<bits<8
                                   list<dag> ZeroMaskingPattern,
                                   string MaskingConstraint = "",
                                   InstrItinClass itin = NoItinerary,
-                                  bit IsCommutable = 0> {
+                                  bit IsCommutable = 0,
+                                  bit IsKCommutable = 0> {
   let isCommutable = IsCommutable in
     def NAME: AVX512<O, F, Outs, Ins,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
@@ -202,7 +203,7 @@ multiclass AVX512_maskable_custom<bits<8
                        Pattern, itin>;
 
   // Prefer over VMOV*rrk Pat<>
-  let AddedComplexity = 20 in
+  let AddedComplexity = 20, isCommutable = IsKCommutable in
     def NAME#k: AVX512<O, F, Outs, MaskingIns,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
                                      "$dst {${mask}}, "#IntelSrcAsm#"}",
@@ -210,8 +211,11 @@ multiclass AVX512_maskable_custom<bits<8
               EVEX_K {
       // In case of the 3src subclass this is overridden with a let.
       string Constraints = MaskingConstraint;
-  }
-  let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
+    }
+
+  // Zero mask does not add any restrictions to commute operands transformation.
+  // So, it is Ok to use IsCommutable instead of IsKCommutable.
+  let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
     def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
                                      "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
@@ -231,14 +235,16 @@ multiclass AVX512_maskable_common<bits<8
                                   SDNode Select = vselect,
                                   string MaskingConstraint = "",
                                   InstrItinClass itin = NoItinerary,
-                                  bit IsCommutable = 0> :
+                                  bit IsCommutable = 0,
+                                  bit IsKCommutable = 0> :
   AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
                          AttSrcAsm, IntelSrcAsm,
                          [(set _.RC:$dst, RHS)],
                          [(set _.RC:$dst, MaskingRHS)],
                          [(set _.RC:$dst,
                                (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
-                         MaskingConstraint, NoItinerary, IsCommutable>;
+                         MaskingConstraint, NoItinerary, IsCommutable,
+                         IsKCommutable>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the vector instruction.  In the masking case, the
@@ -248,13 +254,14 @@ multiclass AVX512_maskable<bits<8> O, Fo
                            string AttSrcAsm, string IntelSrcAsm,
                            dag RHS,
                            InstrItinClass itin = NoItinerary,
-                           bit IsCommutable = 0, SDNode Select = vselect> :
+                           bit IsCommutable = 0, bit IsKCommutable = 0,
+                           SDNode Select = vselect> :
    AVX512_maskable_common<O, F, _, Outs, Ins,
                           !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
                           !con((ins _.KRCWM:$mask), Ins),
                           OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
                           (Select _.KRCWM:$mask, RHS, _.RC:$src0), Select,
-                          "$src0 = $dst", itin, IsCommutable>;
+                          "$src0 = $dst", itin, IsCommutable, IsKCommutable>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the scalar instruction.
@@ -278,15 +285,17 @@ multiclass AVX512_maskable_scalar<bits<8
 multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
                                 dag Outs, dag NonTiedIns, string OpcodeStr,
                                 string AttSrcAsm, string IntelSrcAsm,
-                                dag RHS> :
+                                dag RHS, bit IsCommutable = 0, 
+                                bit IsKCommutable = 0> :
    AVX512_maskable_common<O, F, _, Outs,
                           !con((ins _.RC:$src1), NonTiedIns),
                           !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
                           !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
                           OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
-                          (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>;
+                          (vselect _.KRCWM:$mask, RHS, _.RC:$src1),
+                          vselect, "", NoItinerary, IsCommutable, IsKCommutable>;
 
-// Similar to AVX512_maskable_3rc but in this case the input VT for the tied
+// Similar to AVX512_maskable_3src but in this case the input VT for the tied
 // operand differs from the output VT. This requires a bitconvert on
 // the preserved vector going into the vselect.
 multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
@@ -305,14 +314,16 @@ multiclass AVX512_maskable_3src_cast<bit
 multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
                                      dag Outs, dag NonTiedIns, string OpcodeStr,
                                      string AttSrcAsm, string IntelSrcAsm,
-                                     dag RHS> :
+                                     dag RHS, bit IsCommutable = 0,
+                                     bit IsKCommutable = 0> :
    AVX512_maskable_common<O, F, _, Outs,
                           !con((ins _.RC:$src1), NonTiedIns),
                           !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
                           !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
                           OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
                           (X86selects _.KRCWM:$mask, RHS, _.RC:$src1),
-                          X86selects>;
+                          X86selects, "", NoItinerary, IsCommutable,
+                          IsKCommutable>;
 
 multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
                                   dag Outs, dag Ins,
@@ -4842,13 +4853,13 @@ multiclass avx512_fma3p_213_rm<bits<8> o
   defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3))>,
+          (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
          AVX512FMA3Base;
 
   defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.MemOp:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3)))>,
+          (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
           AVX512FMA3Base;
 
   defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4856,7 +4867,7 @@ multiclass avx512_fma3p_213_rm<bits<8> o
             OpcodeStr,   !strconcat("${src3}", _.BroadcastStr,", $src2"),
             !strconcat("$src2, ${src3}", _.BroadcastStr ),
             (OpNode _.RC:$src2,
-             _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
+             _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
             AVX512FMA3Base, EVEX_B;
   }
 
@@ -4875,7 +4886,7 @@ multiclass avx512_fma3_213_round<bits<8>
   defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
           OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
-          (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc)))>,
+          (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
           AVX512FMA3Base, EVEX_B, EVEX_RC;
 }
 
@@ -4917,13 +4928,13 @@ multiclass avx512_fma3p_231_rm<bits<8> o
   defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1))>,
+          (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
          AVX512FMA3Base;
 
   defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.MemOp:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
+          (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
          AVX512FMA3Base;
 
   defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4932,7 +4943,7 @@ multiclass avx512_fma3p_231_rm<bits<8> o
          "$src2, ${src3}"##_.BroadcastStr,
          (_.VT (OpNode _.RC:$src2,
                       (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
-                      _.RC:$src1))>, AVX512FMA3Base, EVEX_B;
+                      _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B;
   }
 
   // Additional patterns for folding broadcast nodes in other orders.
@@ -4960,7 +4971,7 @@ multiclass avx512_fma3_231_round<bits<8>
   defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
           OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
-          (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc)))>,
+          (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>,
           AVX512FMA3Base, EVEX_B, EVEX_RC;
 }
 
@@ -6036,7 +6047,7 @@ multiclass avx512_cvtps2ph<X86VectorVTIn
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
                                 (i32 imm:$src2),
                                 (i32 FROUND_CURRENT)),
-                   NoItinerary, 0, X86select>, AVX512AIi8Base;
+                   NoItinerary, 0, 0, X86select>, AVX512AIi8Base;
   def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
              (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
              "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -6056,7 +6067,7 @@ multiclass avx512_cvtps2ph_sae<X86Vector
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
                                 (i32 imm:$src2),
                                 (i32 FROUND_NO_EXC)),
-                   NoItinerary, 0, X86select>, EVEX_B, AVX512AIi8Base;
+                   NoItinerary, 0, 0, X86select>, EVEX_B, AVX512AIi8Base;
 }
 let Predicates = [HasAVX512] in {
   defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,

Added: llvm/trunk/lib/Target/X86/X86InstrFMA3Info.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFMA3Info.cpp?rev=278431&view=auto
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFMA3Info.cpp (added)
+++ llvm/trunk/lib/Target/X86/X86InstrFMA3Info.cpp Thu Aug 11 17:07:33 2016
@@ -0,0 +1,284 @@
+//===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrFMA3Info.h"
+#include "X86InstrInfo.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Threading.h"
+
+/// This flag is used in the method llvm::call_once() used below to make the
+/// initialization of the map 'OpcodeToGroup' thread safe.
+LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag);
+
+static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
+X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
+  return &*X86InstrFMA3InfoObj;
+}
+
+void X86InstrFMA3Info::initRMGroup(const uint16_t *RegOpcodes,
+                                   const uint16_t *MemOpcodes, unsigned Attr) {
+  // Create a new instance of this class that would hold a group of FMA opcodes.
+  X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, MemOpcodes, Attr);
+
+  // Add the references from individual opcodes to the group holding them.
+  assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
+          !OpcodeToGroup[RegOpcodes[2]] && !OpcodeToGroup[MemOpcodes[0]] &&
+          !OpcodeToGroup[MemOpcodes[1]] && !OpcodeToGroup[MemOpcodes[2]]) &&
+         "Duplication or rewrite of elements in OpcodeToGroup.");
+  OpcodeToGroup[RegOpcodes[0]] = G;
+  OpcodeToGroup[RegOpcodes[1]] = G;
+  OpcodeToGroup[RegOpcodes[2]] = G;
+  OpcodeToGroup[MemOpcodes[0]] = G;
+  OpcodeToGroup[MemOpcodes[1]] = G;
+  OpcodeToGroup[MemOpcodes[2]] = G;
+}
+
+void X86InstrFMA3Info::initRGroup(const uint16_t *RegOpcodes, unsigned Attr) {
+  // Create a new instance of this class that would hold a group of FMA opcodes.
+  X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, nullptr, Attr);
+
+  // Add the references from individual opcodes to the group holding them.
+  assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
+          !OpcodeToGroup[RegOpcodes[2]]) &&
+         "Duplication or rewrite of elements in OpcodeToGroup.");
+  OpcodeToGroup[RegOpcodes[0]] = G;
+  OpcodeToGroup[RegOpcodes[1]] = G;
+  OpcodeToGroup[RegOpcodes[2]] = G;
+}
+
+void X86InstrFMA3Info::initMGroup(const uint16_t *MemOpcodes, unsigned Attr) {
+  // Create a new instance of this class that would hold a group of FMA opcodes.
+  X86InstrFMA3Group *G = new X86InstrFMA3Group(nullptr, MemOpcodes, Attr);
+
+  // Add the references from individual opcodes to the group holding them.
+  assert((!OpcodeToGroup[MemOpcodes[0]] && !OpcodeToGroup[MemOpcodes[1]] &&
+          !OpcodeToGroup[MemOpcodes[2]]) &&
+         "Duplication or rewrite of elements in OpcodeToGroup.");
+  OpcodeToGroup[MemOpcodes[0]] = G;
+  OpcodeToGroup[MemOpcodes[1]] = G;
+  OpcodeToGroup[MemOpcodes[2]] = G;
+}
+
+#define FMA3RM(R132, R213, R231, M132, M213, M231)                             \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initRMGroup(Reg##R132, Mem##R132);
+
+#define FMA3RMA(R132, R213, R231, M132, M213, M231, Attrs)                     \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initRMGroup(Reg##R132, Mem##R132, (Attrs));
+
+#define FMA3R(R132, R213, R231)                                                \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  initRGroup(Reg##R132);
+
+#define FMA3RA(R132, R213, R231, Attrs)                                        \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  initRGroup(Reg##R132, (Attrs));
+
+#define FMA3M(M132, M213, M231)                                                \
+  static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initMGroup(Mem##M132);
+
+#define FMA3MA(M132, M213, M231, Attrs)                                        \
+  static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initMGroup(Mem##M132, (Attrs));
+
+#define FMA3_AVX2_VECTOR_GROUP(Name)                                           \
+  FMA3RM(Name##132PSr, Name##213PSr, Name##231PSr,                             \
+         Name##132PSm, Name##213PSm, Name##231PSm);                            \
+  FMA3RM(Name##132PDr, Name##213PDr, Name##231PDr,                             \
+         Name##132PDm, Name##213PDm, Name##231PDm);                            \
+  FMA3RM(Name##132PSYr, Name##213PSYr, Name##231PSYr,                          \
+         Name##132PSYm, Name##213PSYm, Name##231PSYm);                         \
+  FMA3RM(Name##132PDYr, Name##213PDYr, Name##231PDYr,                          \
+         Name##132PDYm, Name##213PDYm, Name##231PDYm);
+
+#define FMA3_AVX2_SCALAR_GROUP(Name)                                           \
+  FMA3RM(Name##132SSr, Name##213SSr, Name##231SSr,                             \
+         Name##132SSm, Name##213SSm, Name##231SSm);                            \
+  FMA3RM(Name##132SDr, Name##213SDr, Name##231SDr,                             \
+         Name##132SDm, Name##213SDm, Name##231SDm);                            \
+  FMA3RMA(Name##132SSr_Int, Name##213SSr_Int, Name##231SSr_Int,                \
+          Name##132SSm_Int, Name##213SSm_Int, Name##231SSm_Int,                \
+          X86InstrFMA3Group::X86FMA3Intrinsic);                                \
+  FMA3RMA(Name##132SDr_Int, Name##213SDr_Int, Name##231SDr_Int,                \
+          Name##132SDm_Int, Name##213SDm_Int, Name##231SDm_Int,                \
+          X86InstrFMA3Group::X86FMA3Intrinsic);
+
+#define FMA3_AVX2_FULL_GROUP(Name)                                             \
+  FMA3_AVX2_VECTOR_GROUP(Name);                                                \
+  FMA3_AVX2_SCALAR_GROUP(Name);
+
+#define FMA3_AVX512_VECTOR_GROUP(Name)                                         \
+  FMA3RM(Name##132PSZ128r, Name##213PSZ128r, Name##231PSZ128r,                 \
+         Name##132PSZ128m, Name##213PSZ128m, Name##231PSZ128m);                \
+  FMA3RM(Name##132PDZ128r, Name##213PDZ128r, Name##231PDZ128r,                 \
+         Name##132PDZ128m, Name##213PDZ128m, Name##231PDZ128m);                \
+  FMA3RM(Name##132PSZ256r, Name##213PSZ256r, Name##231PSZ256r,                 \
+         Name##132PSZ256m, Name##213PSZ256m, Name##231PSZ256m);                \
+  FMA3RM(Name##132PDZ256r, Name##213PDZ256r, Name##231PDZ256r,                 \
+         Name##132PDZ256m, Name##213PDZ256m, Name##231PDZ256m);                \
+  FMA3RM(Name##132PSZr,    Name##213PSZr,    Name##231PSZr,                    \
+         Name##132PSZm,    Name##213PSZm,    Name##231PSZm);                   \
+  FMA3RM(Name##132PDZr,    Name##213PDZr,    Name##231PDZr,                    \
+         Name##132PDZm,    Name##213PDZm,    Name##231PDZm);                   \
+  FMA3RMA(Name##132PSZ128rk, Name##213PSZ128rk, Name##231PSZ128rk,             \
+          Name##132PSZ128mk, Name##213PSZ128mk, Name##231PSZ128mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PDZ128rk, Name##213PDZ128rk, Name##231PDZ128rk,             \
+          Name##132PDZ128mk, Name##213PDZ128mk, Name##231PDZ128mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PSZ256rk, Name##213PSZ256rk, Name##231PSZ256rk,             \
+          Name##132PSZ256mk, Name##213PSZ256mk, Name##231PSZ256mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PDZ256rk, Name##213PDZ256rk, Name##231PDZ256rk,             \
+          Name##132PDZ256mk, Name##213PDZ256mk, Name##231PDZ256mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PSZrk,    Name##213PSZrk,    Name##231PSZrk,                \
+          Name##132PSZmk,    Name##213PSZmk,    Name##231PSZmk,                \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PDZrk,    Name##213PDZrk,    Name##231PDZrk,                \
+          Name##132PDZmk,    Name##213PDZmk,    Name##231PDZmk,                \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PSZ128rkz, Name##213PSZ128rkz, Name##231PSZ128rkz,          \
+          Name##132PSZ128mkz, Name##213PSZ128mkz, Name##231PSZ128mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PDZ128rkz, Name##213PDZ128rkz, Name##231PDZ128rkz,          \
+          Name##132PDZ128mkz, Name##213PDZ128mkz, Name##231PDZ128mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PSZ256rkz, Name##213PSZ256rkz, Name##231PSZ256rkz,          \
+          Name##132PSZ256mkz, Name##213PSZ256mkz, Name##231PSZ256mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PDZ256rkz, Name##213PDZ256rkz, Name##231PDZ256rkz,          \
+          Name##132PDZ256mkz, Name##213PDZ256mkz, Name##231PDZ256mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PSZrkz,    Name##213PSZrkz,    Name##231PSZrkz,             \
+          Name##132PSZmkz,    Name##213PSZmkz,    Name##231PSZmkz,             \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PDZrkz,    Name##213PDZrkz,    Name##231PDZrkz,             \
+          Name##132PDZmkz,    Name##213PDZmkz,    Name##231PDZmkz,             \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3R(Name##132PSZrb, Name##213PSZrb, Name##231PSZrb);                       \
+  FMA3R(Name##132PDZrb, Name##213PDZrb, Name##231PDZrb);                       \
+  FMA3RA(Name##132PSZrbk, Name##213PSZrbk, Name##231PSZrbk,                    \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3RA(Name##132PDZrbk, Name##213PDZrbk, Name##231PDZrbk,                    \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3RA(Name##132PSZrbkz, Name##213PSZrbkz, Name##231PSZrbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3RA(Name##132PDZrbkz, Name##213PDZrbkz, Name##231PDZrbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3M(Name##132PSZ128mb, Name##213PSZ128mb, Name##231PSZ128mb);              \
+  FMA3M(Name##132PDZ128mb, Name##213PDZ128mb, Name##231PDZ128mb);              \
+  FMA3M(Name##132PSZ256mb, Name##213PSZ256mb, Name##231PSZ256mb);              \
+  FMA3M(Name##132PDZ256mb, Name##213PDZ256mb, Name##231PDZ256mb);              \
+  FMA3M(Name##132PSZmb, Name##213PSZmb, Name##231PSZmb);                       \
+  FMA3M(Name##132PDZmb, Name##213PDZmb, Name##231PDZmb);                       \
+  FMA3MA(Name##132PSZ128mbk, Name##213PSZ128mbk, Name##231PSZ128mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PDZ128mbk, Name##213PDZ128mbk, Name##231PDZ128mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PSZ256mbk, Name##213PSZ256mbk, Name##231PSZ256mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PDZ256mbk, Name##213PDZ256mbk, Name##231PDZ256mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PSZmbk,    Name##213PSZmbk,    Name##231PSZmbk,              \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PDZmbk,    Name##213PDZmbk,    Name##231PDZmbk,              \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PSZ128mbkz, Name##213PSZ128mbkz, Name##231PSZ128mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PDZ128mbkz, Name##213PDZ128mbkz, Name##231PDZ128mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PSZ256mbkz, Name##213PSZ256mbkz, Name##231PSZ256mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PDZ256mbkz, Name##213PDZ256mbkz, Name##231PDZ256mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PSZmbkz, Name##213PSZmbkz, Name##231PSZmbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PDZmbkz, Name##213PDZmbkz, Name##231PDZmbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);
+
+#define FMA3_AVX512_SCALAR_GROUP(Name)                                         \
+  FMA3RM(Name##132SSZr,      Name##213SSZr,     Name##231SSZr,                 \
+         Name##132SSZm,      Name##213SSZm,     Name##231SSZm);                \
+  FMA3RM(Name##132SDZr,      Name##213SDZr,     Name##231SDZr,                 \
+         Name##132SDZm,      Name##213SDZm,     Name##231SDZm);                \
+  FMA3RMA(Name##132SSZr_Int, Name##213SSZr_Int, Name##231SSZr_Int,             \
+          Name##132SSZm_Int, Name##213SSZm_Int, Name##231SSZm_Int,             \
+          X86InstrFMA3Group::X86FMA3Intrinsic);                                \
+  FMA3RMA(Name##132SDZr_Int, Name##213SDZr_Int, Name##231SDZr_Int,             \
+          Name##132SDZm_Int, Name##213SDZm_Int, Name##231SDZm_Int,             \
+          X86InstrFMA3Group::X86FMA3Intrinsic);                                \
+  FMA3RMA(Name##132SSZr_Intk, Name##213SSZr_Intk, Name##231SSZr_Intk,          \
+          Name##132SSZm_Intk, Name##213SSZm_Intk, Name##231SSZm_Intk,          \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KMergeMasked);                         \
+  FMA3RMA(Name##132SDZr_Intk, Name##213SDZr_Intk, Name##231SDZr_Intk,          \
+          Name##132SDZm_Intk, Name##213SDZm_Intk, Name##231SDZm_Intk,          \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KMergeMasked);                         \
+  FMA3RMA(Name##132SSZr_Intkz, Name##213SSZr_Intkz, Name##231SSZr_Intkz,       \
+          Name##132SSZm_Intkz, Name##213SSZm_Intkz, Name##231SSZm_Intkz,       \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KZeroMasked);                          \
+  FMA3RMA(Name##132SDZr_Intkz, Name##213SDZr_Intkz, Name##231SDZr_Intkz,       \
+          Name##132SDZm_Intkz, Name##213SDZm_Intkz, Name##231SDZm_Intkz,       \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KZeroMasked);                          \
+  FMA3RA(Name##132SSZrb_Int, Name##213SSZrb_Int, Name##231SSZrb_Int,           \
+         X86InstrFMA3Group::X86FMA3Intrinsic);                                 \
+  FMA3RA(Name##132SDZrb_Int, Name##213SDZrb_Int, Name##231SDZrb_Int,           \
+         X86InstrFMA3Group::X86FMA3Intrinsic);                                 \
+  FMA3RA(Name##132SSZrb_Intk, Name##213SSZrb_Intk, Name##231SSZrb_Intk,        \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KMergeMasked);                          \
+  FMA3RA(Name##132SDZrb_Intk, Name##213SDZrb_Intk, Name##231SDZrb_Intk,        \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KMergeMasked);                          \
+  FMA3RA(Name##132SSZrb_Intkz, Name##213SSZrb_Intkz, Name##231SSZrb_Intkz,     \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KZeroMasked);                           \
+  FMA3RA(Name##132SDZrb_Intkz, Name##213SDZrb_Intkz, Name##231SDZrb_Intkz,     \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KZeroMasked);
+
+#define FMA3_AVX512_FULL_GROUP(Name)                                           \
+  FMA3_AVX512_VECTOR_GROUP(Name);                                              \
+  FMA3_AVX512_SCALAR_GROUP(Name);
+
+void X86InstrFMA3Info::initGroupsOnceImpl() {
+  FMA3_AVX2_FULL_GROUP(VFMADD);
+  FMA3_AVX2_FULL_GROUP(VFMSUB);
+  FMA3_AVX2_FULL_GROUP(VFNMADD);
+  FMA3_AVX2_FULL_GROUP(VFNMSUB);
+
+  FMA3_AVX2_VECTOR_GROUP(VFMADDSUB);
+  FMA3_AVX2_VECTOR_GROUP(VFMSUBADD);
+
+  FMA3_AVX512_FULL_GROUP(VFMADD);
+  FMA3_AVX512_FULL_GROUP(VFMSUB);
+  FMA3_AVX512_FULL_GROUP(VFNMADD);
+  FMA3_AVX512_FULL_GROUP(VFNMSUB);
+
+  FMA3_AVX512_VECTOR_GROUP(VFMADDSUB);
+  FMA3_AVX512_VECTOR_GROUP(VFMSUBADD);
+}
+
+void X86InstrFMA3Info::initGroupsOnce() {
+  llvm::call_once(InitGroupsOnceFlag,
+                  []() { getX86InstrFMA3Info()->initGroupsOnceImpl(); });
+}

Propchange: llvm/trunk/lib/Target/X86/X86InstrFMA3Info.cpp
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: llvm/trunk/lib/Target/X86/X86InstrFMA3Info.cpp
------------------------------------------------------------------------------
    svn:keywords = Author Date Id Rev URL

Propchange: llvm/trunk/lib/Target/X86/X86InstrFMA3Info.cpp
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: llvm/trunk/lib/Target/X86/X86InstrFMA3Info.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFMA3Info.h?rev=278431&view=auto
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFMA3Info.h (added)
+++ llvm/trunk/lib/Target/X86/X86InstrFMA3Info.h Thu Aug 11 17:07:33 2016
@@ -0,0 +1,315 @@
+//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+
+#include "X86.h"
+#include "llvm/ADT/DenseMap.h"
+#include <cassert>
+#include <set>
+
+using namespace llvm;
+
+/// This class is used to group {132, 213, 231} forms of FMA opcodes together.
+/// Each of the groups has either 3 register opcodes, 3 memory opcodes,
+/// or 6 register and memory opcodes. Also, each group has an attributes field
+/// describing it.
+class X86InstrFMA3Group {
+private:
+  /// Reference to an array holding 3 forms of register FMA opcodes.
+  /// It may be set to nullptr if the group of FMA opcodes does not have
+  /// any register form opcodes.
+  const uint16_t *RegOpcodes;
+
+  /// Reference to an array holding 3 forms of memory FMA opcodes.
+  /// It may be set to nullptr if the group of FMA opcodes does not have
+  /// any memory form opcodes.
+  const uint16_t *MemOpcodes;
+
+  /// This bitfield specifies the attributes associated with the created
+  /// FMA groups of opcodes.
+  unsigned Attributes;
+
+  static const unsigned Form132 = 0;
+  static const unsigned Form213 = 1;
+  static const unsigned Form231 = 2;
+
+public:
+  /// This bit must be set in the 'Attributes' field of FMA group if such
+  /// group of FMA opcodes consists of FMA intrinsic opcodes.
+  static const unsigned X86FMA3Intrinsic = 0x1;
+
+  /// This bit must be set in the 'Attributes' field of FMA group if such
+  /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and
+  /// passing the elements from the 1st operand to the result of the operation
+  /// when the corresponding bits in the k-mask are unset.
+  static const unsigned X86FMA3KMergeMasked = 0x2;
+
+  /// This bit must be set in the 'Attributes' field of FMA group if such
+  /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask.
+  static const unsigned X86FMA3KZeroMasked = 0x4;
+
+  /// Constructor. Creates a new group of FMA opcodes with three register form
+  /// FMA opcodes \p RegOpcodes and three memory form FMA opcodes \p MemOpcodes.
+  /// The parameters \p RegOpcodes and \p MemOpcodes may be set to nullptr,
+  /// which means that the created group of FMA opcodes does not have the
+  /// corresponding (register or memory) opcodes.
+  /// The parameter \p Attr specifies the attributes describing the created
+  /// group.
+  X86InstrFMA3Group(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes,
+                    unsigned Attr)
+      : RegOpcodes(RegOpcodes), MemOpcodes(MemOpcodes), Attributes(Attr) {
+    assert((RegOpcodes || MemOpcodes) &&
+           "Cannot create a group not having any opcodes.");
+  }
+
+  /// Returns a memory form opcode that is the equivalent of the given register
+  /// form opcode \p RegOpcode. 0 is returned if the group does not have
+  /// either register of memory opcodes.
+  unsigned getMemOpcode(unsigned RegOpcode) const {
+    if (!RegOpcodes || !MemOpcodes)
+      return 0;
+    for (unsigned Form = 0; Form < 3; Form++)
+      if (RegOpcodes[Form] == RegOpcode)
+        return MemOpcodes[Form];
+    return 0;
+  }
+
+  /// Returns the 132 form of FMA register opcode.
+  unsigned getReg132Opcode() const {
+    assert(RegOpcodes && "The group does not have register opcodes.");
+    return RegOpcodes[Form132];
+  }
+
+  /// Returns the 213 form of FMA register opcode.
+  unsigned getReg213Opcode() const {
+    assert(RegOpcodes && "The group does not have register opcodes.");
+    return RegOpcodes[Form213];
+  }
+
+  /// Returns the 231 form of FMA register opcode.
+  unsigned getReg231Opcode() const {
+    assert(RegOpcodes && "The group does not have register opcodes.");
+    return RegOpcodes[Form231];
+  }
+
+  /// Returns the 132 form of FMA memory opcode.
+  unsigned getMem132Opcode() const {
+    assert(MemOpcodes && "The group does not have memory opcodes.");
+    return MemOpcodes[Form132];
+  }
+
+  /// Returns the 213 form of FMA memory opcode.
+  unsigned getMem213Opcode() const {
+    assert(MemOpcodes && "The group does not have memory opcodes.");
+    return MemOpcodes[Form213];
+  }
+
+  /// Returns the 231 form of FMA memory opcode.
+  unsigned getMem231Opcode() const {
+    assert(MemOpcodes && "The group does not have memory opcodes.");
+    return MemOpcodes[Form231];
+  }
+
+  /// Returns true iff the group of FMA opcodes holds intrinsic opcodes.
+  bool isIntrinsic() const { return (Attributes & X86FMA3Intrinsic) != 0; }
+
+  /// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes.
+  bool isKMergeMasked() const {
+    return (Attributes & X86FMA3KMergeMasked) != 0;
+  }
+
+  /// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes.
+  bool isKZeroMasked() const { return (Attributes & X86FMA3KZeroMasked) != 0; }
+
+  /// Returns true iff the group of FMA opcodes holds any of k-masked opcodes.
+  bool isKMasked() const {
+    return (Attributes & (X86FMA3KMergeMasked | X86FMA3KZeroMasked)) != 0;
+  }
+
+  /// Returns true iff the given \p Opcode is a register opcode from the
+  /// groups of FMA opcodes.
+  bool isRegOpcodeFromGroup(unsigned Opcode) const {
+    if (!RegOpcodes)
+      return false;
+    for (unsigned Form = 0; Form < 3; Form++)
+      if (Opcode == RegOpcodes[Form])
+        return true;
+    return false;
+  }
+
+  /// Returns true iff the given \p Opcode is a memory opcode from the
+  /// groups of FMA opcodes.
+  bool isMemOpcodeFromGroup(unsigned Opcode) const {
+    if (!MemOpcodes)
+      return false;
+    for (unsigned Form = 0; Form < 3; Form++)
+      if (Opcode == MemOpcodes[Form])
+        return true;
+    return false;
+  }
+};
+
+/// This class provides information about all existing FMA3 opcodes
+///
+class X86InstrFMA3Info {
+private:
+  /// A map that is used to find the group of FMA opcodes using any FMA opcode
+  /// from the group.
+  DenseMap<unsigned, const X86InstrFMA3Group *> OpcodeToGroup;
+
+  /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
+  /// This method can be called many times, but the actual initialization is
+  /// called only once.
+  static void initGroupsOnce();
+
+  /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
+  /// This method must be called ONLY from initGroupsOnce(). Otherwise, such
+  /// call is not thread safe.
+  void initGroupsOnceImpl();
+
+  /// Creates one group of FMA opcodes having the register opcodes
+  /// \p RegOpcodes and memory opcodes \p MemOpcodes. The parameter \p Attr
+  /// specifies the attributes describing the created group.
+  void initRMGroup(const uint16_t *RegOpcodes,
+                   const uint16_t *MemOpcodes, unsigned Attr = 0);
+
+  /// Creates one group of FMA opcodes having only the register opcodes
+  /// \p RegOpcodes. The parameter \p Attr specifies the attributes describing
+  /// the created group.
+  void initRGroup(const uint16_t *RegOpcodes, unsigned Attr = 0);
+
+  /// Creates one group of FMA opcodes having only the memory opcodes
+  /// \p MemOpcodes. The parameter \p Attr specifies the attributes describing
+  /// the created group.
+  void initMGroup(const uint16_t *MemOpcodes, unsigned Attr = 0);
+
+public:
+  /// Returns the reference to an object of this class. It is assumed that
+  /// only one object may exist.
+  static X86InstrFMA3Info *getX86InstrFMA3Info();
+
+  /// Constructor. Just creates an object of the class.
+  X86InstrFMA3Info() {}
+
+  /// Destructor. Deallocates the memory used for FMA3 Groups.
+  ~X86InstrFMA3Info() {
+    std::set<const X86InstrFMA3Group *> DeletedGroups;
+    auto E = OpcodeToGroup.end();
+    for (auto I = OpcodeToGroup.begin(); I != E; I++) {
+      const X86InstrFMA3Group *G = I->second;
+      if (DeletedGroups.find(G) == DeletedGroups.end()) {
+        DeletedGroups.insert(G);
+        delete G;
+      }
+    }
+  }
+
+  /// Returns a reference to a group of FMA3 opcodes to where the given
+  /// \p Opcode is included. If the given \p Opcode is not recognized as FMA3
+  /// and not included into any FMA3 group, then nullptr is returned.
+  static const X86InstrFMA3Group *getFMA3Group(unsigned Opcode) {
+    // Ensure that the groups of opcodes are initialized.
+    initGroupsOnce();
+
+    // Find the group including the given opcode.
+    const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
+    auto I = FMA3Info->OpcodeToGroup.find(Opcode);
+    if (I == FMA3Info->OpcodeToGroup.end())
+      return nullptr;
+
+    return I->second;
+  }
+
+  /// Returns true iff the given \p Opcode is recognized as FMA3 by this class.
+  static bool isFMA3(unsigned Opcode) {
+    return getFMA3Group(Opcode) != nullptr;
+  }
+
+  /// Iterator that is used to walk on FMA register opcodes having memory
+  /// form equivalents.
+  class rm_iterator {
+  private:
+    /// Iterator associated with the OpcodeToGroup map. It must always be
+    /// initialized with an entry from OpcodeToGroup for which I->first
+    /// points to a register FMA opcode and I->second points to a group of
+    /// FMA opcodes having memory form equivalent of I->first.
+    DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I;
+
+  public:
+    /// Constructor. Creates rm_iterator. The parameter \p I must be an
+    /// iterator to OpcodeToGroup map entry having I->first pointing to
+    /// register form FMA opcode and I->second pointing to a group of FMA
+    /// opcodes holding memory form equivalent for I->first.
+    rm_iterator(DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I)
+        : I(I) {}
+
+    /// Returns the register form FMA opcode.
+    unsigned getRegOpcode() const { return I->first; };
+
+    /// Returns the memory form equivalent opcode for FMA register opcode
+    /// referenced by I->first.
+    unsigned getMemOpcode() const {
+      unsigned Opcode = I->first;
+      const X86InstrFMA3Group *Group = I->second;
+      return Group->getMemOpcode(Opcode);
+    }
+
+    /// Returns a reference to a group of FMA opcodes.
+    const X86InstrFMA3Group *getGroup() const { return I->second; }
+
+    bool operator==(const rm_iterator &OtherIt) const { return I == OtherIt.I; }
+    bool operator!=(const rm_iterator &OtherIt) const { return I != OtherIt.I; }
+
+    /// Increment. Advances the 'I' iterator to the next OpcodeToGroup entry
+    /// having I->first pointing to register form FMA and I->second pointing
+    /// to a group of FMA opcodes holding memory form equivalent for I->first.
+    rm_iterator &operator++() {
+      auto E = getX86InstrFMA3Info()->OpcodeToGroup.end();
+      for (++I; I != E; ++I) {
+        unsigned RegOpcode = I->first;
+        const X86InstrFMA3Group *Group = I->second;
+        if (Group->getMemOpcode(RegOpcode) != 0)
+          break;
+      }
+      return *this;
+    }
+  };
+
+  /// Returns rm_iterator pointing to the first entry of OpcodeToGroup map
+  /// with a register FMA opcode having memory form opcode equivalent.
+  static rm_iterator rm_begin() {
+    initGroupsOnce();
+    const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
+    auto I = FMA3Info->OpcodeToGroup.begin();
+    auto E = FMA3Info->OpcodeToGroup.end();
+    while (I != E) {
+      unsigned Opcode = I->first;
+      const X86InstrFMA3Group *G = I->second;
+      if (G->getMemOpcode(Opcode) != 0)
+        break;
+      I++;
+    }
+    return rm_iterator(I);
+  }
+
+  /// Returns the last rm_iterator.
+  static rm_iterator rm_end() {
+    initGroupsOnce();
+    return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
+  }
+};
+
+#endif

Propchange: llvm/trunk/lib/Target/X86/X86InstrFMA3Info.h
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: llvm/trunk/lib/Target/X86/X86InstrFMA3Info.h
------------------------------------------------------------------------------
    svn:keywords = Author Date Id Rev URL

Propchange: llvm/trunk/lib/Target/X86/X86InstrFMA3Info.h
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=278431&r1=278430&r2=278431&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Thu Aug 11 17:07:33 2016
@@ -1855,281 +1855,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
   }
 
   static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
-    // FMA foldable instructions
-    { X86::VFMADD231SSr,          X86::VFMADD231SSm,          TB_ALIGN_NONE },
-    { X86::VFMADD231SSr_Int,      X86::VFMADD231SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD231SDr,          X86::VFMADD231SDm,          TB_ALIGN_NONE },
-    { X86::VFMADD231SDr_Int,      X86::VFMADD231SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD132SSr,          X86::VFMADD132SSm,          TB_ALIGN_NONE },
-    { X86::VFMADD132SSr_Int,      X86::VFMADD132SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD132SDr,          X86::VFMADD132SDm,          TB_ALIGN_NONE },
-    { X86::VFMADD132SDr_Int,      X86::VFMADD132SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD213SSr,          X86::VFMADD213SSm,          TB_ALIGN_NONE },
-    { X86::VFMADD213SSr_Int,      X86::VFMADD213SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD213SDr,          X86::VFMADD213SDm,          TB_ALIGN_NONE },
-    { X86::VFMADD213SDr_Int,      X86::VFMADD213SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD231SSZr,         X86::VFMADD231SSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD231SSZr_Int,     X86::VFMADD231SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMADD231SDZr,         X86::VFMADD231SDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD231SDZr_Int,     X86::VFMADD231SDZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMADD132SSZr,         X86::VFMADD132SSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD132SSZr_Int,     X86::VFMADD132SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMADD132SDZr,         X86::VFMADD132SDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD132SDZr_Int,     X86::VFMADD132SDZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMADD213SSZr,         X86::VFMADD213SSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD213SSZr_Int,     X86::VFMADD213SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMADD213SDZr,         X86::VFMADD213SDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD213SDZr_Int,     X86::VFMADD213SDZm_Int,     TB_ALIGN_NONE },
-
-    { X86::VFMADD231PSr,          X86::VFMADD231PSm,          TB_ALIGN_NONE },
-    { X86::VFMADD231PDr,          X86::VFMADD231PDm,          TB_ALIGN_NONE },
-    { X86::VFMADD132PSr,          X86::VFMADD132PSm,          TB_ALIGN_NONE },
-    { X86::VFMADD132PDr,          X86::VFMADD132PDm,          TB_ALIGN_NONE },
-    { X86::VFMADD213PSr,          X86::VFMADD213PSm,          TB_ALIGN_NONE },
-    { X86::VFMADD213PDr,          X86::VFMADD213PDm,          TB_ALIGN_NONE },
-    { X86::VFMADD231PSYr,         X86::VFMADD231PSYm,         TB_ALIGN_NONE },
-    { X86::VFMADD231PDYr,         X86::VFMADD231PDYm,         TB_ALIGN_NONE },
-    { X86::VFMADD132PSYr,         X86::VFMADD132PSYm,         TB_ALIGN_NONE },
-    { X86::VFMADD132PDYr,         X86::VFMADD132PDYm,         TB_ALIGN_NONE },
-    { X86::VFMADD213PSYr,         X86::VFMADD213PSYm,         TB_ALIGN_NONE },
-    { X86::VFMADD213PDYr,         X86::VFMADD213PDYm,         TB_ALIGN_NONE },
-    { X86::VFMADD231PSZr,         X86::VFMADD231PSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD231PDZr,         X86::VFMADD231PDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD132PSZr,         X86::VFMADD132PSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD132PDZr,         X86::VFMADD132PDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD213PSZr,         X86::VFMADD213PSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD213PDZr,         X86::VFMADD213PDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD231PSZ128r,      X86::VFMADD231PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD231PDZ128r,      X86::VFMADD231PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD132PSZ128r,      X86::VFMADD132PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD132PDZ128r,      X86::VFMADD132PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD213PSZ128r,      X86::VFMADD213PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD213PDZ128r,      X86::VFMADD213PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD231PSZ256r,      X86::VFMADD231PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMADD231PDZ256r,      X86::VFMADD231PDZ256m,      TB_ALIGN_NONE },
-    { X86::VFMADD132PSZ256r,      X86::VFMADD132PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMADD132PDZ256r,      X86::VFMADD132PDZ256m,      TB_ALIGN_NONE },
-    { X86::VFMADD213PSZ256r,      X86::VFMADD213PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMADD213PDZ256r,      X86::VFMADD213PDZ256m,      TB_ALIGN_NONE },
-
-    { X86::VFNMADD231SSr,         X86::VFNMADD231SSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD231SSr_Int,     X86::VFNMADD231SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD231SDr,         X86::VFNMADD231SDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD231SDr_Int,     X86::VFNMADD231SDm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD132SSr,         X86::VFNMADD132SSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD132SSr_Int,     X86::VFNMADD132SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD132SDr,         X86::VFNMADD132SDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD132SDr_Int,     X86::VFNMADD132SDm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD213SSr,         X86::VFNMADD213SSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD213SSr_Int,     X86::VFNMADD213SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD213SDr,         X86::VFNMADD213SDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD213SDr_Int,     X86::VFNMADD213SDm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD231SSZr,        X86::VFNMADD231SSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231SSZr_Int,    X86::VFNMADD231SSZm_Int,    TB_ALIGN_NONE },
-    { X86::VFNMADD231SDZr,        X86::VFNMADD231SDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231SDZr_Int,    X86::VFNMADD231SDZm_Int,    TB_ALIGN_NONE },
-    { X86::VFNMADD132SSZr,        X86::VFNMADD132SSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132SSZr_Int,    X86::VFNMADD132SSZm_Int,    TB_ALIGN_NONE },
-    { X86::VFNMADD132SDZr,        X86::VFNMADD132SDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132SDZr_Int,    X86::VFNMADD132SDZm_Int,    TB_ALIGN_NONE },
-    { X86::VFNMADD213SSZr,        X86::VFNMADD213SSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213SSZr_Int,    X86::VFNMADD213SSZm_Int,    TB_ALIGN_NONE },
-    { X86::VFNMADD213SDZr,        X86::VFNMADD213SDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213SDZr_Int,    X86::VFNMADD213SDZm_Int,    TB_ALIGN_NONE },
-
-    { X86::VFNMADD231PSr,         X86::VFNMADD231PSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD231PDr,         X86::VFNMADD231PDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD132PSr,         X86::VFNMADD132PSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD132PDr,         X86::VFNMADD132PDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD213PSr,         X86::VFNMADD213PSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD213PDr,         X86::VFNMADD213PDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD231PSYr,        X86::VFNMADD231PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231PDYr,        X86::VFNMADD231PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132PSYr,        X86::VFNMADD132PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132PDYr,        X86::VFNMADD132PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213PSYr,        X86::VFNMADD213PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213PDYr,        X86::VFNMADD213PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231PSZr,        X86::VFNMADD231PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231PDZr,        X86::VFNMADD231PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132PSZr,        X86::VFNMADD132PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132PDZr,        X86::VFNMADD132PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213PSZr,        X86::VFNMADD213PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213PDZr,        X86::VFNMADD213PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231PSZ128r,     X86::VFNMADD231PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD231PDZ128r,     X86::VFNMADD231PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD132PSZ128r,     X86::VFNMADD132PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD132PDZ128r,     X86::VFNMADD132PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD213PSZ128r,     X86::VFNMADD213PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD213PDZ128r,     X86::VFNMADD213PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD231PSZ256r,     X86::VFNMADD231PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMADD231PDZ256r,     X86::VFNMADD231PDZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMADD132PSZ256r,     X86::VFNMADD132PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMADD132PDZ256r,     X86::VFNMADD132PDZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMADD213PSZ256r,     X86::VFNMADD213PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMADD213PDZ256r,     X86::VFNMADD213PDZ256m,     TB_ALIGN_NONE },
-
-    { X86::VFMSUB231SSr,          X86::VFMSUB231SSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB231SSr_Int,      X86::VFMSUB231SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB231SDr,          X86::VFMSUB231SDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB231SDr_Int,      X86::VFMSUB231SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB132SSr,          X86::VFMSUB132SSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB132SSr_Int,      X86::VFMSUB132SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB132SDr,          X86::VFMSUB132SDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB132SDr_Int,      X86::VFMSUB132SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB213SSr,          X86::VFMSUB213SSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB213SSr_Int,      X86::VFMSUB213SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB213SDr,          X86::VFMSUB213SDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB213SDr_Int,      X86::VFMSUB213SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB231SSZr,         X86::VFMSUB231SSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231SSZr_Int,     X86::VFMSUB231SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMSUB231SDZr,         X86::VFMSUB231SDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231SDZr_Int,     X86::VFMSUB231SDZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMSUB132SSZr,         X86::VFMSUB132SSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132SSZr_Int,     X86::VFMSUB132SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMSUB132SDZr,         X86::VFMSUB132SDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132SDZr_Int,     X86::VFMSUB132SDZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMSUB213SSZr,         X86::VFMSUB213SSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213SSZr_Int,     X86::VFMSUB213SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMSUB213SDZr,         X86::VFMSUB213SDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213SDZr_Int,     X86::VFMSUB213SDZm_Int,     TB_ALIGN_NONE },
-
-    { X86::VFMSUB231PSr,          X86::VFMSUB231PSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB231PDr,          X86::VFMSUB231PDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB132PSr,          X86::VFMSUB132PSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB132PDr,          X86::VFMSUB132PDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB213PSr,          X86::VFMSUB213PSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB213PDr,          X86::VFMSUB213PDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB231PSYr,         X86::VFMSUB231PSYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231PDYr,         X86::VFMSUB231PDYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132PSYr,         X86::VFMSUB132PSYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132PDYr,         X86::VFMSUB132PDYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213PSYr,         X86::VFMSUB213PSYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213PDYr,         X86::VFMSUB213PDYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231PSZr,         X86::VFMSUB231PSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231PDZr,         X86::VFMSUB231PDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132PSZr,         X86::VFMSUB132PSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132PDZr,         X86::VFMSUB132PDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213PSZr,         X86::VFMSUB213PSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213PDZr,         X86::VFMSUB213PDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231PSZ128r,      X86::VFMSUB231PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB231PDZ128r,      X86::VFMSUB231PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB132PSZ128r,      X86::VFMSUB132PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB132PDZ128r,      X86::VFMSUB132PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB213PSZ128r,      X86::VFMSUB213PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB213PDZ128r,      X86::VFMSUB213PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB231PSZ256r,      X86::VFMSUB231PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMSUB231PDZ256r,      X86::VFMSUB231PDZ256m,      TB_ALIGN_NONE },
-    { X86::VFMSUB132PSZ256r,      X86::VFMSUB132PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMSUB132PDZ256r,      X86::VFMSUB132PDZ256m,      TB_ALIGN_NONE },
-    { X86::VFMSUB213PSZ256r,      X86::VFMSUB213PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMSUB213PDZ256r,      X86::VFMSUB213PDZ256m,      TB_ALIGN_NONE },
-
-    { X86::VFNMSUB231SSr,         X86::VFNMSUB231SSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB231SSr_Int,     X86::VFNMSUB231SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMSUB231SDr,         X86::VFNMSUB231SDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB231SDr_Int,     X86::VFNMSUB231SDm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132SSr,         X86::VFNMSUB132SSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB132SSr_Int,     X86::VFNMSUB132SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132SDr,         X86::VFNMSUB132SDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB132SDr_Int,     X86::VFNMSUB132SDm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213SSr,         X86::VFNMSUB213SSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB213SSr_Int,     X86::VFNMSUB213SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213SDr,         X86::VFNMSUB213SDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB213SDr_Int,     X86::VFNMSUB213SDm_Int,     TB_ALIGN_NONE },
-
-    { X86::VFNMSUB231PSr,         X86::VFNMSUB231PSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB231PDr,         X86::VFNMSUB231PDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB132PSr,         X86::VFNMSUB132PSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB132PDr,         X86::VFNMSUB132PDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB213PSr,         X86::VFNMSUB213PSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB213PDr,         X86::VFNMSUB213PDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB231PSYr,        X86::VFNMSUB231PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB231PDYr,        X86::VFNMSUB231PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB132PSYr,        X86::VFNMSUB132PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB132PDYr,        X86::VFNMSUB132PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB213PSYr,        X86::VFNMSUB213PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB213PDYr,        X86::VFNMSUB213PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB231PSZr,        X86::VFNMSUB231PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB231PDZr,        X86::VFNMSUB231PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB132PSZr,        X86::VFNMSUB132PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB132PDZr,        X86::VFNMSUB132PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB213PSZr,        X86::VFNMSUB213PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB213PDZr,        X86::VFNMSUB213PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB231PSZ128r,     X86::VFNMSUB231PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB231PDZ128r,     X86::VFNMSUB231PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132PSZ128r,     X86::VFNMSUB132PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132PDZ128r,     X86::VFNMSUB132PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213PSZ128r,     X86::VFNMSUB213PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213PDZ128r,     X86::VFNMSUB213PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB231PSZ256r,     X86::VFNMSUB231PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB231PDZ256r,     X86::VFNMSUB231PDZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132PSZ256r,     X86::VFNMSUB132PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132PDZ256r,     X86::VFNMSUB132PDZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213PSZ256r,     X86::VFNMSUB213PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213PDZ256r,     X86::VFNMSUB213PDZ256m,     TB_ALIGN_NONE },
-
-    { X86::VFMADDSUB231PSr,       X86::VFMADDSUB231PSm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PDr,       X86::VFMADDSUB231PDm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PSr,       X86::VFMADDSUB132PSm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PDr,       X86::VFMADDSUB132PDm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PSr,       X86::VFMADDSUB213PSm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PDr,       X86::VFMADDSUB213PDm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PSYr,      X86::VFMADDSUB231PSYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PDYr,      X86::VFMADDSUB231PDYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PSYr,      X86::VFMADDSUB132PSYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PDYr,      X86::VFMADDSUB132PDYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PSYr,      X86::VFMADDSUB213PSYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PDYr,      X86::VFMADDSUB213PDYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PSZr,      X86::VFMADDSUB231PSZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PDZr,      X86::VFMADDSUB231PDZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PSZr,      X86::VFMADDSUB132PSZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PDZr,      X86::VFMADDSUB132PDZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PSZr,      X86::VFMADDSUB213PSZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PDZr,      X86::VFMADDSUB213PDZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PSZ128r,   X86::VFMADDSUB231PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PDZ128r,   X86::VFMADDSUB231PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PSZ128r,   X86::VFMADDSUB132PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PDZ128r,   X86::VFMADDSUB132PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PSZ128r,   X86::VFMADDSUB213PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PDZ128r,   X86::VFMADDSUB213PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PSZ256r,   X86::VFMADDSUB231PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PDZ256r,   X86::VFMADDSUB231PDZ256m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PSZ256r,   X86::VFMADDSUB132PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PDZ256r,   X86::VFMADDSUB132PDZ256m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PSZ256r,   X86::VFMADDSUB213PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PDZ256r,   X86::VFMADDSUB213PDZ256m,   TB_ALIGN_NONE },
-
-    { X86::VFMSUBADD231PSr,       X86::VFMSUBADD231PSm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PDr,       X86::VFMSUBADD231PDm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PSr,       X86::VFMSUBADD132PSm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PDr,       X86::VFMSUBADD132PDm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PSr,       X86::VFMSUBADD213PSm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PDr,       X86::VFMSUBADD213PDm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PSYr,      X86::VFMSUBADD231PSYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PDYr,      X86::VFMSUBADD231PDYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PSYr,      X86::VFMSUBADD132PSYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PDYr,      X86::VFMSUBADD132PDYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PSYr,      X86::VFMSUBADD213PSYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PDYr,      X86::VFMSUBADD213PDYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PSZr,      X86::VFMSUBADD231PSZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PDZr,      X86::VFMSUBADD231PDZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PSZr,      X86::VFMSUBADD132PSZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PDZr,      X86::VFMSUBADD132PDZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PSZr,      X86::VFMSUBADD213PSZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PDZr,      X86::VFMSUBADD213PDZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PSZ128r,   X86::VFMSUBADD231PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PDZ128r,   X86::VFMSUBADD231PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PSZ128r,   X86::VFMSUBADD132PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PDZ128r,   X86::VFMSUBADD132PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PSZ128r,   X86::VFMSUBADD213PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PDZ128r,   X86::VFMSUBADD213PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PSZ256r,   X86::VFMSUBADD231PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PDZ256r,   X86::VFMSUBADD231PDZ256m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PSZ256r,   X86::VFMSUBADD132PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PDZ256r,   X86::VFMSUBADD132PDZ256m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PSZ256r,   X86::VFMSUBADD213PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PDZ256r,   X86::VFMSUBADD213PDZ256m,   TB_ALIGN_NONE },
-
     // FMA4 foldable patterns
     { X86::VFMADDSS4rr,           X86::VFMADDSS4rm,           TB_ALIGN_NONE },
     { X86::VFMADDSD4rr,           X86::VFMADDSD4rm,           TB_ALIGN_NONE },
@@ -2234,6 +1959,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
                   // Index 3, folded load
                   Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
   }
+  auto I = X86InstrFMA3Info::rm_begin();
+  auto E = X86InstrFMA3Info::rm_end();
+  for (; I != E; ++I)
+    if (!I.getGroup()->isKMasked())
+      AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+                    I.getRegOpcode(), I.getMemOpcode(),
+                    TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD);
 
   static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
      // AVX-512 foldable instructions
@@ -2283,6 +2015,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
                   // Index 4, folded load
                   Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
   }
+  for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I)
+    if (I.getGroup()->isKMasked())
+      AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+                    I.getRegOpcode(), I.getMemOpcode(),
+                    TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
 }
 
 void
@@ -3345,241 +3082,11 @@ X86InstrInfo::convertToThreeAddress(Mach
   return NewMI;
 }
 
-/// Returns true if the given instruction opcode is FMA3.
-/// Otherwise, returns false.
-/// The second parameter is optional and is used as the second return from
-/// the function. It is set to true if the given instruction has FMA3 opcode
-/// that is used for lowering of scalar FMA intrinsics, and it is set to false
-/// otherwise.
-static bool isFMA3(unsigned Opcode, bool &IsIntrinsic) {
-  IsIntrinsic = false;
-
-#define FMA3_CASE(Name, Modifier) \
-case X86::Name##r##Modifier: case X86::Name##m##Modifier:
-
-#define FMA3_SCALAR_PAIR(Name, Size, Modifier) \
-  FMA3_CASE(Name##SD##Size, Modifier) \
-  FMA3_CASE(Name##SS##Size, Modifier)
-
-#define FMA3_PACKED_PAIR(Name, Size) \
-  FMA3_CASE(Name##PD##Size, ) \
-  FMA3_CASE(Name##PS##Size, )
-
-#define FMA3_PACKED_SET(Form, Size) \
-  FMA3_PACKED_PAIR(VFMADD##Form, Size) \
-  FMA3_PACKED_PAIR(VFMSUB##Form, Size) \
-  FMA3_PACKED_PAIR(VFNMADD##Form, Size) \
-  FMA3_PACKED_PAIR(VFNMSUB##Form, Size) \
-  FMA3_PACKED_PAIR(VFMADDSUB##Form, Size) \
-  FMA3_PACKED_PAIR(VFMSUBADD##Form, Size)
-
-#define FMA3_CASES(Form) \
-  FMA3_SCALAR_PAIR(VFMADD##Form, ,) \
-  FMA3_SCALAR_PAIR(VFMSUB##Form, ,) \
-  FMA3_SCALAR_PAIR(VFNMADD##Form, ,) \
-  FMA3_SCALAR_PAIR(VFNMSUB##Form, ,) \
-  FMA3_PACKED_SET(Form, ) \
-  FMA3_PACKED_SET(Form, Y) \
-
-#define FMA3_CASES_AVX512(Form) \
-  FMA3_SCALAR_PAIR(VFMADD##Form, Z, ) \
-  FMA3_SCALAR_PAIR(VFMSUB##Form, Z, ) \
-  FMA3_SCALAR_PAIR(VFNMADD##Form, Z, ) \
-  FMA3_SCALAR_PAIR(VFNMSUB##Form, Z, ) \
-  FMA3_PACKED_SET(Form, Z128) \
-  FMA3_PACKED_SET(Form, Z256) \
-  FMA3_PACKED_SET(Form, Z)
-
-#define FMA3_CASES_SCALAR_INT(Form) \
-  FMA3_SCALAR_PAIR(VFMADD##Form, , _Int) \
-  FMA3_SCALAR_PAIR(VFMSUB##Form, , _Int) \
-  FMA3_SCALAR_PAIR(VFNMADD##Form, , _Int) \
-  FMA3_SCALAR_PAIR(VFNMSUB##Form, , _Int)
-
-#define FMA3_CASES_SCALAR_INT_AVX512(Form) \
-  FMA3_SCALAR_PAIR(VFMADD##Form, Z, _Int) \
-  FMA3_SCALAR_PAIR(VFMSUB##Form, Z, _Int) \
-  FMA3_SCALAR_PAIR(VFNMADD##Form, Z, _Int) \
-  FMA3_SCALAR_PAIR(VFNMSUB##Form, Z, _Int)
-
-  switch (Opcode) {
-  FMA3_CASES(132)
-  FMA3_CASES(213)
-  FMA3_CASES(231)
-
-  // AVX-512 instructions
-  FMA3_CASES_AVX512(132)
-  FMA3_CASES_AVX512(213)
-  FMA3_CASES_AVX512(231)
-    return true;
-
-  FMA3_CASES_SCALAR_INT(132)
-  FMA3_CASES_SCALAR_INT(213)
-  FMA3_CASES_SCALAR_INT(231)
-
-  // AVX-512 instructions
-  FMA3_CASES_SCALAR_INT_AVX512(132)
-  FMA3_CASES_SCALAR_INT_AVX512(213)
-  FMA3_CASES_SCALAR_INT_AVX512(231)
-    IsIntrinsic = true;
-    return true;
-  default:
-    return false;
-  }
-  llvm_unreachable("Opcode not handled by the switch");
-
-#undef FMA3_CASE
-#undef FMA3_SCALAR_PAIR
-#undef FMA3_PACKED_PAIR
-#undef FMA3_PACKED_SET
-#undef FMA3_CASES
-#undef FMA3_CASES_AVX512
-#undef FMA3_CASES_SCALAR_INT
-#undef FMA3_CASES_SCALAR_INT_AVX512
-}
-
-/// Returns an adjusted FMA opcode that must be used in FMA instruction that
-/// performs the same computations as the given MI but which has the operands
-/// \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
-/// It may return 0 if it is unsafe to commute the operands.
-///
-/// The returned FMA opcode may differ from the opcode in the given \p MI.
-/// For example, commuting the operands #1 and #3 in the following FMA
-///     FMA213 #1, #2, #3
-/// results into instruction with adjusted opcode:
-///     FMA231 #3, #2, #1
-static unsigned getFMA3OpcodeToCommuteOperands(unsigned Opc,
-                                               bool IsIntrinOpcode,
-                                               unsigned SrcOpIdx1,
-                                               unsigned SrcOpIdx2) {
-#define FMA3_ENTRY(Name, Suffix) \
-  { X86::Name##132##Suffix, X86::Name##213##Suffix, X86::Name##231##Suffix },
-
-#define FMA3_SCALAR_PAIR(Name, Suffix) \
-  FMA3_ENTRY(Name, SS##Suffix) \
-  FMA3_ENTRY(Name, SD##Suffix)
-
-#define FMA3_PACKED_PAIR(Name, Suffix) \
-  FMA3_ENTRY(Name, PS##Suffix) \
-  FMA3_ENTRY(Name, PD##Suffix)
-
-#define FMA3_PACKED_SIZES(Name, Suffix) \
-  FMA3_PACKED_PAIR(Name, Suffix) \
-  FMA3_PACKED_PAIR(Name, Y##Suffix)
-
-#define FMA3_TABLE_ALL(Name) \
-  FMA3_SCALAR_PAIR(Name, r) \
-  FMA3_PACKED_SIZES(Name, r) \
-  FMA3_SCALAR_PAIR(Name, m) \
-  FMA3_PACKED_SIZES(Name, m)
-
-#define FMA3_TABLE_PACKED(Name) \
-  FMA3_PACKED_SIZES(Name, r) \
-  FMA3_PACKED_SIZES(Name, m)
-
-#define FMA3_TABLE_SCALAR_INT(Name) \
-  FMA3_SCALAR_PAIR(Name, r_Int) \
-  FMA3_SCALAR_PAIR(Name, m_Int)
-
-#define FMA3_PACKED_SIZES_AVX512(Name, Suffix) \
-  FMA3_PACKED_PAIR(Name, Z128##Suffix) \
-  FMA3_PACKED_PAIR(Name, Z256##Suffix) \
-  FMA3_PACKED_PAIR(Name, Z##Suffix)
-
-#define FMA3_TABLE_ALL_AVX512(Name) \
-  FMA3_SCALAR_PAIR(Name, Zr) \
-  FMA3_PACKED_SIZES_AVX512(Name, r) \
-  FMA3_SCALAR_PAIR(Name, Zm) \
-  FMA3_PACKED_SIZES_AVX512(Name, m)
-
-#define FMA3_TABLE_PACKED_AVX512(Name) \
-  FMA3_PACKED_SIZES_AVX512(Name, r) \
-  FMA3_PACKED_SIZES_AVX512(Name, m)
-
-#define FMA3_TABLE_SCALAR_INT_AVX512(Name) \
-  FMA3_SCALAR_PAIR(Name, Zr_Int) \
-  FMA3_SCALAR_PAIR(Name, Zm_Int)
-
-  // Define the array that holds FMA opcodes in groups
-  // of 3 opcodes(132, 213, 231) in each group.
-  static const uint16_t RegularOpcodeGroups[][3] = {
-    FMA3_TABLE_ALL(VFMADD)
-    FMA3_TABLE_ALL(VFMSUB)
-    FMA3_TABLE_ALL(VFNMADD)
-    FMA3_TABLE_ALL(VFNMSUB)
-    FMA3_TABLE_PACKED(VFMADDSUB)
-    FMA3_TABLE_PACKED(VFMSUBADD)
-
-    // AVX-512 instructions
-    FMA3_TABLE_ALL_AVX512(VFMADD)
-    FMA3_TABLE_ALL_AVX512(VFMSUB)
-    FMA3_TABLE_ALL_AVX512(VFNMADD)
-    FMA3_TABLE_ALL_AVX512(VFNMSUB)
-    FMA3_TABLE_PACKED_AVX512(VFMADDSUB)
-    FMA3_TABLE_PACKED_AVX512(VFMSUBADD)
-  };
-
-  // Define the array that holds FMA*_Int opcodes in groups
-  // of 3 opcodes(132, 213, 231) in each group.
-  static const uint16_t IntrinOpcodeGroups[][3] = {
-    FMA3_TABLE_SCALAR_INT(VFMADD)
-    FMA3_TABLE_SCALAR_INT(VFMSUB)
-    FMA3_TABLE_SCALAR_INT(VFNMADD)
-    FMA3_TABLE_SCALAR_INT(VFNMSUB)
-
-    // AVX-512 instructions
-    FMA3_TABLE_SCALAR_INT_AVX512(VFMADD)
-    FMA3_TABLE_SCALAR_INT_AVX512(VFMSUB)
-    FMA3_TABLE_SCALAR_INT_AVX512(VFNMADD)
-    FMA3_TABLE_SCALAR_INT_AVX512(VFNMSUB)
-  };
-
-#undef FMA3_ENTRY
-#undef FMA3_SCALAR_PAIR
-#undef FMA3_PACKED_PAIR
-#undef FMA3_PACKED_SIZES
-#undef FMA3_TABLE_ALL
-#undef FMA3_TABLE_PACKED
-#undef FMA3_TABLE_SCALAR_INT
-#undef FMA3_SCALAR_PAIR_AVX512
-#undef FMA3_PACKED_SIZES_AVX512
-#undef FMA3_TABLE_ALL_AVX512
-#undef FMA3_TABLE_PACKED_AVX512
-#undef FMA3_TABLE_SCALAR_INT_AVX512
-
-  const unsigned Form132Index = 0;
-  const unsigned Form213Index = 1;
-  const unsigned Form231Index = 2;
-  const unsigned FormsNum = 3;
-
-  size_t GroupsNum;
-  const uint16_t (*OpcodeGroups)[3];
-  if (IsIntrinOpcode) {
-    GroupsNum = array_lengthof(IntrinOpcodeGroups);
-    OpcodeGroups = IntrinOpcodeGroups;
-  } else {
-    GroupsNum = array_lengthof(RegularOpcodeGroups);
-    OpcodeGroups = RegularOpcodeGroups;
-  }
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
+    const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
+    const X86InstrFMA3Group &FMA3Group) const {
 
-  const uint16_t *FoundOpcodesGroup = nullptr;
-  size_t FormIndex;
-
-  // Look for the input opcode in the corresponding opcodes table.
-  for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup;
-         ++GroupIndex) {
-    for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) {
-      if (OpcodeGroups[GroupIndex][FormIndex] == Opc) {
-        FoundOpcodesGroup = OpcodeGroups[GroupIndex];
-        break;
-      }
-    }
-  }
-
-  // The input opcode does not match with any of the opcodes from the tables.
-  // The unsupported FMA opcode must be added to one of the two opcode groups
-  // defined above.
-  assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode");
+  unsigned Opc = MI.getOpcode();
 
   // Put the lowest index to SrcOpIdx1 to simplify the checks below.
   if (SrcOpIdx1 > SrcOpIdx2)
@@ -3591,15 +3098,40 @@ static unsigned getFMA3OpcodeToCommuteOp
   // not implemented yet. So, just return 0 in that case.
   // When such analysis are available this place will be the right place for
   // calling it.
-  if (IsIntrinOpcode && SrcOpIdx1 == 1)
+  if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
     return 0;
 
+  unsigned FMAOp1 = 1, FMAOp2 = 2, FMAOp3 = 3;
+  if (FMA3Group.isKMasked()) {
+    // The k-mask operand cannot be commuted.
+    if (SrcOpIdx1 == 2)
+      return 0;
+
+    // For k-zero-masked operations it is Ok to commute the first vector
+    // operand.
+    // For regular k-masked operations a conservative choice is done as the
+    // elements of the first vector operand, for which the corresponding bit
+    // in the k-mask operand is set to 0, are copied to the result of FMA.
+    // TODO/FIXME: The commute still may be legal if it is known that the
+    // k-mask operand is set to either all ones or all zeroes.
+    // It is also Ok to commute the 1st operand if all users of MI use only
+    // the elements enabled by the k-mask operand. For example,
+    //   v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
+    //                                                     : v1[i];
+    //   VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
+    //                                  // Ok, to commute v1 in FMADD213PSZrk.
+    if (FMA3Group.isKMergeMasked() && SrcOpIdx1 == FMAOp1)
+      return 0;
+    FMAOp2++;
+    FMAOp3++;
+  }
+
   unsigned Case;
-  if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2)
+  if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp2)
     Case = 0;
-  else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3)
+  else if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp3)
     Case = 1;
-  else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3)
+  else if (SrcOpIdx1 == FMAOp2 && SrcOpIdx2 == FMAOp3)
     Case = 2;
   else
     return 0;
@@ -3607,6 +3139,9 @@ static unsigned getFMA3OpcodeToCommuteOp
   // Define the FMA forms mapping array that helps to map input FMA form
   // to output FMA form to preserve the operation semantics after
   // commuting the operands.
+  const unsigned Form132Index = 0;
+  const unsigned Form213Index = 1;
+  const unsigned Form231Index = 2;
   static const unsigned FormMapping[][3] = {
     // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
     // FMA132 A, C, b; ==> FMA231 C, A, b;
@@ -3625,9 +3160,24 @@ static unsigned getFMA3OpcodeToCommuteOp
     { Form213Index, Form132Index, Form231Index }
   };
 
+  unsigned FMAForms[3];
+  if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
+    FMAForms[0] = FMA3Group.getReg132Opcode();
+    FMAForms[1] = FMA3Group.getReg213Opcode();
+    FMAForms[2] = FMA3Group.getReg231Opcode();
+  } else {
+    FMAForms[0] = FMA3Group.getMem132Opcode();
+    FMAForms[1] = FMA3Group.getMem213Opcode();
+    FMAForms[2] = FMA3Group.getMem231Opcode();
+  }
+  unsigned FormIndex;
+  for (FormIndex = 0; FormIndex < 3; FormIndex++)
+    if (Opc == FMAForms[FormIndex])
+      break;
+
   // Everything is ready, just adjust the FMA opcode and return it.
   FormIndex = FormMapping[Case][FormIndex];
-  return FoundOpcodesGroup[FormIndex];
+  return FMAForms[FormIndex];
 }
 
 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
@@ -3852,11 +3402,11 @@ MachineInstr *X86InstrInfo::commuteInstr
                                                    OpIdx1, OpIdx2);
   }
   default:
-    bool IsIntrinOpcode;
-    if (isFMA3(MI.getOpcode(), IsIntrinOpcode)) {
-      unsigned Opc = getFMA3OpcodeToCommuteOperands(MI.getOpcode(),
-                                                    IsIntrinOpcode,
-                                                    OpIdx1, OpIdx2);
+    const X86InstrFMA3Group *FMA3Group =
+        X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+    if (FMA3Group) {
+      unsigned Opc =
+        getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
       if (Opc == 0)
         return nullptr;
       auto &WorkingMI = cloneIfNew(MI);
@@ -3869,21 +3419,37 @@ MachineInstr *X86InstrInfo::commuteInstr
   }
 }
 
-bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
-                                             bool IsIntrinOpcode,
-                                             unsigned &SrcOpIdx1,
-                                             unsigned &SrcOpIdx2) const {
+bool X86InstrInfo::findFMA3CommutedOpIndices(
+    const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
+    const X86InstrFMA3Group &FMA3Group) const {
+  unsigned FirstCommutableVecOp = 1;
+  unsigned LastCommutableVecOp = 3;
+  unsigned KMaskOp = 0;
+  if (FMA3Group.isKMasked()) {
+    // The k-mask operand has index = 2 for masked and zero-masked operations.
+    KMaskOp = 2;
+
+    // The operand with index = 1 is used as a source for those elements for
+    // which the corresponding bit in the k-mask is set to 0.
+    if (FMA3Group.isKMergeMasked())
+      FirstCommutableVecOp = 3;
+
+    LastCommutableVecOp++;
+  }
 
-  unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;
+  if (isMem(MI, LastCommutableVecOp))
+    LastCommutableVecOp--;
 
   // Only the first RegOpsNum operands are commutable.
   // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
   // that the operand is not specified/fixed.
   if (SrcOpIdx1 != CommuteAnyOperandIndex &&
-      (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum))
+      (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
+       SrcOpIdx1 == KMaskOp))
     return false;
   if (SrcOpIdx2 != CommuteAnyOperandIndex &&
-      (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum))
+      (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
+       SrcOpIdx2 == KMaskOp))
     return false;
 
   // Look for two different register operands assumed to be commutable
@@ -3898,7 +3464,7 @@ bool X86InstrInfo::findFMA3CommutedOpInd
     if (SrcOpIdx1 == SrcOpIdx2)
       // Both of operands are not fixed. By default set one of commutable
       // operands to the last register operand of the instruction.
-      CommutableOpIdx2 = RegOpsNum;
+      CommutableOpIdx2 = LastCommutableVecOp;
     else if (SrcOpIdx2 == CommuteAnyOperandIndex)
       // Only one of operands is not fixed.
       CommutableOpIdx2 = SrcOpIdx1;
@@ -3906,7 +3472,12 @@ bool X86InstrInfo::findFMA3CommutedOpInd
     // CommutableOpIdx2 is well defined now. Let's choose another commutable
     // operand and assign its index to CommutableOpIdx1.
     unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
-    for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
+    for (CommutableOpIdx1 = LastCommutableVecOp;
+         CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
+      // Just ignore and skip the k-mask operand.
+      if (CommutableOpIdx1 == KMaskOp)
+        continue;
+
       // The commuted operands must have different registers.
       // Otherwise, the commute transformation does not change anything and
       // is useless then.
@@ -3915,7 +3486,7 @@ bool X86InstrInfo::findFMA3CommutedOpInd
     }
 
     // No appropriate commutable operands were found.
-    if (CommutableOpIdx1 == 0)
+    if (CommutableOpIdx1 < FirstCommutableVecOp)
       return false;
 
     // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
@@ -3927,8 +3498,7 @@ bool X86InstrInfo::findFMA3CommutedOpInd
 
   // Check if we can adjust the opcode to preserve the semantics when
   // commute the register operands.
-  return getFMA3OpcodeToCommuteOperands(MI.getOpcode(), IsIntrinOpcode,
-                                        SrcOpIdx1, SrcOpIdx2) != 0;
+  return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
 }
 
 bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
@@ -3955,10 +3525,10 @@ bool X86InstrInfo::findCommutedOpIndices
     return false;
   }
   default:
-    bool IsIntrinOpcode;
-    if (isFMA3(MI.getOpcode(), IsIntrinOpcode))
-      return findFMA3CommutedOpIndices(MI, IsIntrinOpcode,
-                                       SrcOpIdx1, SrcOpIdx2);
+    const X86InstrFMA3Group *FMA3Group =
+        X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+    if (FMA3Group)
+      return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
     return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
   }
   return false;

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.h?rev=278431&r1=278430&r2=278431&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.h Thu Aug 11 17:07:33 2016
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
 
 #include "MCTargetDesc/X86BaseInfo.h"
+#include "X86InstrFMA3Info.h"
 #include "X86RegisterInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -265,7 +266,7 @@ public:
                              unsigned &SrcOpIdx2) const override;
 
   /// Returns true if the routine could find two commutable operands
-  /// in the given FMA instruction. Otherwise, returns false.
+  /// in the given FMA instruction \p MI. Otherwise, returns false.
   ///
   /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
   /// The output indices of the commuted operands are returned in these
@@ -274,10 +275,12 @@ public:
   /// value 'CommuteAnyOperandIndex' which means that the corresponding
   /// operand index is not set and this method is free to pick any of
   /// available commutable operands.
+  /// The parameter \p FMA3Group keeps the reference to the group of relative
+  /// FMA3 opcodes including register/memory forms of 132/213/231 opcodes.
   ///
   /// For example, calling this method this way:
   ///     unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
-  ///     findFMA3CommutedOpIndices(MI, Idx1, Idx2);
+  ///     findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group);
   /// can be interpreted as a query asking if the operand #1 can be swapped
   /// with any other available operand (e.g. operand #2, operand #3, etc.).
   ///
@@ -286,9 +289,30 @@ public:
   ///     FMA213 #1, #2, #3
   /// results into instruction with adjusted opcode:
   ///     FMA231 #3, #2, #1
-  bool findFMA3CommutedOpIndices(MachineInstr &MI, bool IsIntrinOpcode,
+  bool findFMA3CommutedOpIndices(const MachineInstr &MI,
                                  unsigned &SrcOpIdx1,
-                                 unsigned &SrcOpIdx2) const;
+                                 unsigned &SrcOpIdx2,
+                                 const X86InstrFMA3Group &FMA3Group) const;
+
+  /// Returns an adjusted FMA opcode that must be used in FMA instruction that
+  /// performs the same computations as the given \p MI but which has the
+  /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
+  /// It may return 0 if it is unsafe to commute the operands.
+  /// Note that a machine instruction (instead of its opcode) is passed as the
+  /// first parameter to make it possible to analyze the instruction's uses and
+  /// commute the first operand of FMA even when it seems unsafe when you look
+  /// at the opcode. For example, it is Ok to commute the first operand of
+  /// VFMADD*SD_Int, if ONLY the lowest 64-bit element of the result is used.
+  ///
+  /// The returned FMA opcode may differ from the opcode in the given \p MI.
+  /// For example, commuting the operands #1 and #3 in the following FMA
+  ///     FMA213 #1, #2, #3
+  /// results into instruction with adjusted opcode:
+  ///     FMA231 #3, #2, #1
+  unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI,
+                                          unsigned SrcOpIdx1,
+                                          unsigned SrcOpIdx2,
+                                          const X86InstrFMA3Group &FMA3Group) const;
 
   // Branch analysis.
   bool isUnpredicatedTerminator(const MachineInstr &MI) const override;

Modified: llvm/trunk/test/CodeGen/X86/avx512-fma-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-fma-intrinsics.ll?rev=278431&r1=278430&r2=278431&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-fma-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-fma-intrinsics.ll Thu Aug 11 17:07:33 2016
@@ -310,8 +310,7 @@ define <16 x float> @test_mask_round_vfm
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
 ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
   ret <16 x float> %res
@@ -320,8 +319,7 @@ define <16 x float> @test_mask_round_vfm
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
 ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps {rd-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
   ret <16 x float> %res
@@ -330,8 +328,7 @@ define <16 x float> @test_mask_round_vfm
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
 ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
   ret <16 x float> %res
@@ -340,8 +337,7 @@ define <16 x float> @test_mask_round_vfm
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
 ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps {rz-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
   ret <16 x float> %res
@@ -443,8 +439,7 @@ define <8 x double> @test_mask_round_vfm
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
   ret <8 x double> %res
@@ -453,8 +448,7 @@ define <8 x double> @test_mask_round_vfm
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213pd {rd-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
   ret <8 x double> %res
@@ -463,8 +457,7 @@ define <8 x double> @test_mask_round_vfm
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213pd {ru-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
   ret <8 x double> %res
@@ -473,8 +466,7 @@ define <8 x double> @test_mask_round_vfm
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213pd {rz-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
   ret <8 x double> %res
@@ -641,8 +633,7 @@ define <8 x double> @test_mask_round_vfn
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
   ret <8 x double> %res
@@ -651,8 +642,7 @@ define <8 x double> @test_mask_round_vfn
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfnmsub213pd {rd-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
   ret <8 x double> %res
@@ -661,8 +651,7 @@ define <8 x double> @test_mask_round_vfn
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfnmsub213pd {ru-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
   ret <8 x double> %res
@@ -671,8 +660,7 @@ define <8 x double> @test_mask_round_vfn
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfnmsub213pd {rz-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
   ret <8 x double> %res

Modified: llvm/trunk/test/CodeGen/X86/avx512-fma.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-fma.ll?rev=278431&r1=278430&r2=278431&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-fma.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-fma.ll Thu Aug 11 17:07:33 2016
@@ -101,8 +101,7 @@ define double @test_x86_fmsub_231_m(doub
 define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
 ; ALL-LABEL: test231_br:
 ; ALL:       ## BB#0:
-; ALL-NEXT:    vfmadd231ps {{.*}}(%rip){1to16}, %zmm0, %zmm1
-; ALL-NEXT:    vmovaps %zmm1, %zmm0
+; ALL-NEXT:    vfmadd132ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %b1 = fmul <16 x float> %a1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
   %b2 = fadd <16 x float> %b1, %a2
@@ -112,8 +111,7 @@ define <16 x float> @test231_br(<16 x fl
 define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
 ; ALL-LABEL: test213_br:
 ; ALL:       ## BB#0:
-; ALL-NEXT:    vfmadd213ps {{.*}}(%rip){1to16}, %zmm0, %zmm1
-; ALL-NEXT:    vmovaps %zmm1, %zmm0
+; ALL-NEXT:    vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %b1 = fmul <16 x float> %a1, %a2
   %b2 = fadd <16 x float> %b1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
@@ -175,8 +173,7 @@ define <16 x float> @test_x86_fmadd213_p
 ; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
 ; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
 ; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; KNL-NEXT:    vmovups (%rdi), %zmm2
-; KNL-NEXT:    vfmadd132ps %zmm0, %zmm2, %zmm1 {%k1}
+; KNL-NEXT:    vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
 ; KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; KNL-NEXT:    retq
 ;
@@ -184,8 +181,7 @@ define <16 x float> @test_x86_fmadd213_p
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpsllw $7, %xmm2, %xmm2
 ; SKX-NEXT:    vpmovb2m %xmm2, %k1
-; SKX-NEXT:    vmovups (%rdi), %zmm2
-; SKX-NEXT:    vfmadd132ps %zmm0, %zmm2, %zmm1 {%k1}
+; SKX-NEXT:    vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; SKX-NEXT:    retq
   %a2   = load <16 x float>,<16 x float> *%a2_ptrt,align 1

Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll?rev=278431&r1=278430&r2=278431&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll Thu Aug 11 17:07:33 2016
@@ -1812,8 +1812,7 @@ define <4 x float> @test_mask_vfmadd128_
 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vmovaps (%rdi), %xmm2 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0x17]
-; CHECK-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
+; CHECK-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %a2 = load <4 x float>, <4 x float>* %ptr_a2
   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
@@ -1824,8 +1823,7 @@ define <4 x float> @test_mask_vfmadd128_
 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vmovups (%rdi), %xmm2 ## encoding: [0x62,0xf1,0x7c,0x08,0x10,0x17]
-; CHECK-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
+; CHECK-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8
   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
@@ -1885,8 +1883,7 @@ define <4 x float> @test_mask_vfmadd128_
 define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x18,0xa8,0x0f]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load float, float* %ptr_a2
   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@@ -1900,8 +1897,7 @@ define <4 x float> @test_mask_vfmadd128_
 define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x18,0xa8,0x0f]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load float, float* %ptr_a2, align 4
   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@@ -1935,8 +1931,7 @@ define <2 x double> @test_mask_vfmadd128
 ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vmovapd (%rdi), %xmm2 ## encoding: [0x62,0xf1,0xfd,0x08,0x28,0x17]
-; CHECK-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
+; CHECK-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %a2 = load <2 x double>, <2 x double>* %ptr_a2
   %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
@@ -1976,8 +1971,7 @@ define <4 x double> @test_mask_vfmadd256
 ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vmovapd (%rdi), %ymm2 ## encoding: [0x62,0xf1,0xfd,0x28,0x28,0x17]
-; CHECK-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
+; CHECK-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %a2 = load <4 x double>, <4 x double>* %ptr_a2
   %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind

Modified: llvm/trunk/test/CodeGen/X86/fma-fneg-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fma-fneg-combine.ll?rev=278431&r1=278430&r2=278431&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fma-fneg-combine.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fma-fneg-combine.ll Thu Aug 11 17:07:33 2016
@@ -61,8 +61,7 @@ define <16 x float> @test5(<16 x float>
 ; CHECK-LABEL: test5:
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    vxorps {{.*}}(%rip), %zmm2, %zmm2
-; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
@@ -73,8 +72,8 @@ entry:
 define <16 x float> @test6(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test6:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfnmsub213ps {ru-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vxorps {{.*}}(%rip), %zmm1, %zmm0
+; CHECK-NEXT:    vfnmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vxorps {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 2) #2




More information about the llvm-commits mailing list