[llvm] e017169 - [X86][NFC] Extract ReplaceableInstrs to a separate file and clang-format X86InstrInfo.cpp

Shengchen Kan via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 30 23:22:12 PST 2023


Author: Shengchen Kan
Date: 2023-12-01T15:21:38+08:00
New Revision: e017169dbd0215e892570e74668f5c3289db3310

URL: https://github.com/llvm/llvm-project/commit/e017169dbd0215e892570e74668f5c3289db3310
DIFF: https://github.com/llvm/llvm-project/commit/e017169dbd0215e892570e74668f5c3289db3310.diff

LOG: [X86][NFC] Extract ReplaceableInstrs to a separate file and clang-format X86InstrInfo.cpp

Added: 
    llvm/lib/Target/X86/X86ReplaceableInstrs.def

Modified: 
    llvm/lib/Target/X86/X86InstrInfo.cpp

Removed: 
    


################################################################################
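Note: the new X86ReplaceableInstrs.def is not reproduced in the excerpt below; only the clang-format churn in X86InstrInfo.cpp is shown. For readers unfamiliar with the idiom, extracting such opcode tables into a .def file follows the usual LLVM X-macro pattern, roughly sketched here with hypothetical macro and table names (the actual entries and macro spelling live in the new file and are not shown in this excerpt):

  // Rough sketch only; ENTRY and the table name are illustrative, not the
  // commit's actual spelling.
  //
  // X86ReplaceableInstrs.def would hold rows such as:
  //   ENTRY(MOVAPSmr, VMOVAPSmr, VMOVAPSZ128mr)
  // and X86InstrInfo.cpp would expand them into the lookup table:
  static const uint16_t ReplaceableInstrs[][3] = {
  #define ENTRY(SSE, AVX, AVX512) {X86::SSE, X86::AVX, X86::AVX512},
  #include "X86ReplaceableInstrs.def"
  #undef ENTRY
  };

This keeps the large, mechanically maintained tables out of the .cpp file while the expansion site stays a handful of lines.
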
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b75c00effead01f..583f8ec73a0361f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -58,26 +58,25 @@ static cl::opt<bool>
              cl::desc("Disable fusing of spill code into instructions"),
              cl::Hidden);
 static cl::opt<bool>
-PrintFailedFusing("print-failed-fuse-candidates",
-                  cl::desc("Print instructions that the allocator wants to"
-                           " fuse, but the X86 backend currently can't"),
-                  cl::Hidden);
+    PrintFailedFusing("print-failed-fuse-candidates",
+                      cl::desc("Print instructions that the allocator wants to"
+                               " fuse, but the X86 backend currently can't"),
+                      cl::Hidden);
 static cl::opt<bool>
-ReMatPICStubLoad("remat-pic-stub-load",
-                 cl::desc("Re-materialize load from stub in PIC mode"),
-                 cl::init(false), cl::Hidden);
+    ReMatPICStubLoad("remat-pic-stub-load",
+                     cl::desc("Re-materialize load from stub in PIC mode"),
+                     cl::init(false), cl::Hidden);
 static cl::opt<unsigned>
-PartialRegUpdateClearance("partial-reg-update-clearance",
-                          cl::desc("Clearance between two register writes "
-                                   "for inserting XOR to avoid partial "
-                                   "register update"),
-                          cl::init(64), cl::Hidden);
-static cl::opt<unsigned>
-UndefRegClearance("undef-reg-clearance",
-                  cl::desc("How many idle instructions we would like before "
-                           "certain undef register reads"),
-                  cl::init(128), cl::Hidden);
-
+    PartialRegUpdateClearance("partial-reg-update-clearance",
+                              cl::desc("Clearance between two register writes "
+                                       "for inserting XOR to avoid partial "
+                                       "register update"),
+                              cl::init(64), cl::Hidden);
+static cl::opt<unsigned> UndefRegClearance(
+    "undef-reg-clearance",
+    cl::desc("How many idle instructions we would like before "
+             "certain undef register reads"),
+    cl::init(128), cl::Hidden);
 
 // Pin the vtable to this file.
 void X86InstrInfo::anchor() {}
@@ -87,10 +86,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
                                                : X86::ADJCALLSTACKDOWN32),
                       (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
                                                : X86::ADJCALLSTACKUP32),
-                      X86::CATCHRET,
-                      (STI.is64Bit() ? X86::RET64 : X86::RET32)),
-      Subtarget(STI), RI(STI.getTargetTriple()) {
-}
+                      X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
+      Subtarget(STI), RI(STI.getTargetTriple()) {}
 
 const TargetRegisterClass *
 X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
@@ -123,12 +120,12 @@ X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
   }
 }
 
-bool
-X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
-                                    Register &SrcReg, Register &DstReg,
-                                    unsigned &SubIdx) const {
+bool X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                         Register &SrcReg, Register &DstReg,
+                                         unsigned &SubIdx) const {
   switch (MI.getOpcode()) {
-  default: break;
+  default:
+    break;
   case X86::MOVSX16rr8:
   case X86::MOVZX16rr8:
   case X86::MOVSX32rr8:
@@ -149,7 +146,8 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
     SrcReg = MI.getOperand(1).getReg();
     DstReg = MI.getOperand(0).getReg();
     switch (MI.getOpcode()) {
-    default: llvm_unreachable("Unreachable!");
+    default:
+      llvm_unreachable("Unreachable!");
     case X86::MOVSX16rr8:
     case X86::MOVZX16rr8:
     case X86::MOVSX32rr8:
@@ -441,8 +439,7 @@ int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
     const MachineBasicBlock *MBB = MI.getParent();
     auto I = ++MachineBasicBlock::const_iterator(MI);
     for (auto E = MBB->end(); I != E; ++I) {
-      if (I->getOpcode() == getCallFrameDestroyOpcode() ||
-          I->isCall())
+      if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall())
         break;
     }
 
@@ -764,7 +761,8 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
     return false;
   bool isPICBase = false;
   for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
-         E = MRI.def_instr_end(); I != E; ++I) {
+                                               E = MRI.def_instr_end();
+       I != E; ++I) {
     MachineInstr *DefMI = &*I;
     if (DefMI->getOpcode() != X86::MOVPC32r)
       return false;
@@ -952,9 +950,15 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
     // effects.
     int Value;
     switch (Orig.getOpcode()) {
-    case X86::MOV32r0:  Value = 0; break;
-    case X86::MOV32r1:  Value = 1; break;
-    case X86::MOV32r_1: Value = -1; break;
+    case X86::MOV32r0:
+      Value = 0;
+      break;
+    case X86::MOV32r1:
+      Value = 1;
+      break;
+    case X86::MOV32r_1:
+      Value = -1;
+      break;
     default:
       llvm_unreachable("Unexpected instruction!");
     }
@@ -975,8 +979,8 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
 /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
 bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
   for (const MachineOperand &MO : MI.operands()) {
-    if (MO.isReg() && MO.isDef() &&
-        MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+    if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
+        !MO.isDead()) {
       return true;
     }
   }
@@ -1131,8 +1135,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
   if (AllowSP) {
     RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
   } else {
-    RC = Opc != X86::LEA32r ?
-      &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
+    RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
   }
   Register SrcReg = Src.getReg();
   isKill = MI.killsRegister(SrcReg);
@@ -1195,7 +1198,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
   // We handle 8-bit adds and various 16-bit opcodes in the switch below.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
-  assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
+  assert((Is8BitOp ||
+          RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
               *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
          "Unexpected type for LEA transform");
 
@@ -1241,7 +1245,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
   MachineInstrBuilder MIB =
       BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
   switch (MIOpc) {
-  default: llvm_unreachable("Unreachable!");
+  default:
+    llvm_unreachable("Unreachable!");
   case X86::SHL8ri:
   case X86::SHL16ri: {
     unsigned ShAmt = MI.getOperand(2).getImm();
@@ -1399,11 +1404,13 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
   unsigned NumRegOperands = 2;
   unsigned MIOpc = MI.getOpcode();
   switch (MIOpc) {
-  default: llvm_unreachable("Unreachable!");
+  default:
+    llvm_unreachable("Unreachable!");
   case X86::SHL64ri: {
     assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+    if (!isTruncatedShiftCountForLEA(ShAmt))
+      return nullptr;
 
     // LEA can't handle RSP.
     if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
@@ -1422,7 +1429,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
   case X86::SHL32ri: {
     assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+    if (!isTruncatedShiftCountForLEA(ShAmt))
+      return nullptr;
 
     unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
@@ -1433,14 +1441,13 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
                         ImplicitOp, LV, LIS))
       return nullptr;
 
-    MachineInstrBuilder MIB =
-        BuildMI(MF, MI.getDebugLoc(), get(Opc))
-            .add(Dest)
-            .addReg(0)
-            .addImm(1LL << ShAmt)
-            .addReg(SrcReg, getKillRegState(isKill))
-            .addImm(0)
-            .addReg(0);
+    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+                                  .add(Dest)
+                                  .addReg(0)
+                                  .addImm(1LL << ShAmt)
+                                  .addReg(SrcReg, getKillRegState(isKill))
+                                  .addImm(0)
+                                  .addReg(0);
     if (ImplicitOp.getReg() != 0)
       MIB.add(ImplicitOp);
     NewMI = MIB;
@@ -1463,18 +1470,18 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
   case X86::INC64r:
   case X86::INC32r: {
     assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
-    unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
-        (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+    unsigned Opc = MIOpc == X86::INC64r
+                       ? X86::LEA64r
+                       : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
     bool isKill;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
                         ImplicitOp, LV, LIS))
       return nullptr;
 
-    MachineInstrBuilder MIB =
-        BuildMI(MF, MI.getDebugLoc(), get(Opc))
-            .add(Dest)
-            .addReg(SrcReg, getKillRegState(isKill));
+    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+                                  .add(Dest)
+                                  .addReg(SrcReg, getKillRegState(isKill));
     if (ImplicitOp.getReg() != 0)
       MIB.add(ImplicitOp);
 
@@ -1488,8 +1495,9 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
   case X86::DEC64r:
   case X86::DEC32r: {
     assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
-    unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
-        : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+    unsigned Opc = MIOpc == X86::DEC64r
+                       ? X86::LEA64r
+                       : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
 
     bool isKill;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
@@ -1654,8 +1662,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
 
     assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
 
-    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
-                                      get(X86::LEA64r)).add(Dest).add(Src);
+    MachineInstrBuilder MIB =
+        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
     NewMI = addOffset(MIB, -Imm);
     break;
   }
@@ -1666,18 +1674,30 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
   case X86::VMOVDQU16Z128rmk:
   case X86::VMOVDQU16Z256rmk:
   case X86::VMOVDQU16Zrmk:
-  case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
-  case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
-  case X86::VMOVDQU32Zrmk:    case X86::VMOVDQA32Zrmk:
-  case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
-  case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
-  case X86::VMOVDQU64Zrmk:    case X86::VMOVDQA64Zrmk:
-  case X86::VMOVUPDZ128rmk:   case X86::VMOVAPDZ128rmk:
-  case X86::VMOVUPDZ256rmk:   case X86::VMOVAPDZ256rmk:
-  case X86::VMOVUPDZrmk:      case X86::VMOVAPDZrmk:
-  case X86::VMOVUPSZ128rmk:   case X86::VMOVAPSZ128rmk:
-  case X86::VMOVUPSZ256rmk:   case X86::VMOVAPSZ256rmk:
-  case X86::VMOVUPSZrmk:      case X86::VMOVAPSZrmk:
+  case X86::VMOVDQU32Z128rmk:
+  case X86::VMOVDQA32Z128rmk:
+  case X86::VMOVDQU32Z256rmk:
+  case X86::VMOVDQA32Z256rmk:
+  case X86::VMOVDQU32Zrmk:
+  case X86::VMOVDQA32Zrmk:
+  case X86::VMOVDQU64Z128rmk:
+  case X86::VMOVDQA64Z128rmk:
+  case X86::VMOVDQU64Z256rmk:
+  case X86::VMOVDQA64Z256rmk:
+  case X86::VMOVDQU64Zrmk:
+  case X86::VMOVDQA64Zrmk:
+  case X86::VMOVUPDZ128rmk:
+  case X86::VMOVAPDZ128rmk:
+  case X86::VMOVUPDZ256rmk:
+  case X86::VMOVAPDZ256rmk:
+  case X86::VMOVUPDZrmk:
+  case X86::VMOVAPDZrmk:
+  case X86::VMOVUPSZ128rmk:
+  case X86::VMOVAPSZ128rmk:
+  case X86::VMOVUPSZ256rmk:
+  case X86::VMOVAPSZ256rmk:
+  case X86::VMOVUPSZrmk:
+  case X86::VMOVAPSZrmk:
   case X86::VBROADCASTSDZ256rmk:
   case X86::VBROADCASTSDZrmk:
   case X86::VBROADCASTSSZ128rmk:
@@ -1691,59 +1711,142 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
   case X86::VPBROADCASTQZrmk: {
     unsigned Opc;
     switch (MIOpc) {
-    default: llvm_unreachable("Unreachable!");
-    case X86::VMOVDQU8Z128rmk:     Opc = X86::VPBLENDMBZ128rmk; break;
-    case X86::VMOVDQU8Z256rmk:     Opc = X86::VPBLENDMBZ256rmk; break;
-    case X86::VMOVDQU8Zrmk:        Opc = X86::VPBLENDMBZrmk;    break;
-    case X86::VMOVDQU16Z128rmk:    Opc = X86::VPBLENDMWZ128rmk; break;
-    case X86::VMOVDQU16Z256rmk:    Opc = X86::VPBLENDMWZ256rmk; break;
-    case X86::VMOVDQU16Zrmk:       Opc = X86::VPBLENDMWZrmk;    break;
-    case X86::VMOVDQU32Z128rmk:    Opc = X86::VPBLENDMDZ128rmk; break;
-    case X86::VMOVDQU32Z256rmk:    Opc = X86::VPBLENDMDZ256rmk; break;
-    case X86::VMOVDQU32Zrmk:       Opc = X86::VPBLENDMDZrmk;    break;
-    case X86::VMOVDQU64Z128rmk:    Opc = X86::VPBLENDMQZ128rmk; break;
-    case X86::VMOVDQU64Z256rmk:    Opc = X86::VPBLENDMQZ256rmk; break;
-    case X86::VMOVDQU64Zrmk:       Opc = X86::VPBLENDMQZrmk;    break;
-    case X86::VMOVUPDZ128rmk:      Opc = X86::VBLENDMPDZ128rmk; break;
-    case X86::VMOVUPDZ256rmk:      Opc = X86::VBLENDMPDZ256rmk; break;
-    case X86::VMOVUPDZrmk:         Opc = X86::VBLENDMPDZrmk;    break;
-    case X86::VMOVUPSZ128rmk:      Opc = X86::VBLENDMPSZ128rmk; break;
-    case X86::VMOVUPSZ256rmk:      Opc = X86::VBLENDMPSZ256rmk; break;
-    case X86::VMOVUPSZrmk:         Opc = X86::VBLENDMPSZrmk;    break;
-    case X86::VMOVDQA32Z128rmk:    Opc = X86::VPBLENDMDZ128rmk; break;
-    case X86::VMOVDQA32Z256rmk:    Opc = X86::VPBLENDMDZ256rmk; break;
-    case X86::VMOVDQA32Zrmk:       Opc = X86::VPBLENDMDZrmk;    break;
-    case X86::VMOVDQA64Z128rmk:    Opc = X86::VPBLENDMQZ128rmk; break;
-    case X86::VMOVDQA64Z256rmk:    Opc = X86::VPBLENDMQZ256rmk; break;
-    case X86::VMOVDQA64Zrmk:       Opc = X86::VPBLENDMQZrmk;    break;
-    case X86::VMOVAPDZ128rmk:      Opc = X86::VBLENDMPDZ128rmk; break;
-    case X86::VMOVAPDZ256rmk:      Opc = X86::VBLENDMPDZ256rmk; break;
-    case X86::VMOVAPDZrmk:         Opc = X86::VBLENDMPDZrmk;    break;
-    case X86::VMOVAPSZ128rmk:      Opc = X86::VBLENDMPSZ128rmk; break;
-    case X86::VMOVAPSZ256rmk:      Opc = X86::VBLENDMPSZ256rmk; break;
-    case X86::VMOVAPSZrmk:         Opc = X86::VBLENDMPSZrmk;    break;
-    case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
-    case X86::VBROADCASTSDZrmk:    Opc = X86::VBLENDMPDZrmbk;    break;
-    case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
-    case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
-    case X86::VBROADCASTSSZrmk:    Opc = X86::VBLENDMPSZrmbk;    break;
-    case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
-    case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
-    case X86::VPBROADCASTDZrmk:    Opc = X86::VPBLENDMDZrmbk;    break;
-    case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
-    case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
-    case X86::VPBROADCASTQZrmk:    Opc = X86::VPBLENDMQZrmbk;    break;
+    default:
+      llvm_unreachable("Unreachable!");
+    case X86::VMOVDQU8Z128rmk:
+      Opc = X86::VPBLENDMBZ128rmk;
+      break;
+    case X86::VMOVDQU8Z256rmk:
+      Opc = X86::VPBLENDMBZ256rmk;
+      break;
+    case X86::VMOVDQU8Zrmk:
+      Opc = X86::VPBLENDMBZrmk;
+      break;
+    case X86::VMOVDQU16Z128rmk:
+      Opc = X86::VPBLENDMWZ128rmk;
+      break;
+    case X86::VMOVDQU16Z256rmk:
+      Opc = X86::VPBLENDMWZ256rmk;
+      break;
+    case X86::VMOVDQU16Zrmk:
+      Opc = X86::VPBLENDMWZrmk;
+      break;
+    case X86::VMOVDQU32Z128rmk:
+      Opc = X86::VPBLENDMDZ128rmk;
+      break;
+    case X86::VMOVDQU32Z256rmk:
+      Opc = X86::VPBLENDMDZ256rmk;
+      break;
+    case X86::VMOVDQU32Zrmk:
+      Opc = X86::VPBLENDMDZrmk;
+      break;
+    case X86::VMOVDQU64Z128rmk:
+      Opc = X86::VPBLENDMQZ128rmk;
+      break;
+    case X86::VMOVDQU64Z256rmk:
+      Opc = X86::VPBLENDMQZ256rmk;
+      break;
+    case X86::VMOVDQU64Zrmk:
+      Opc = X86::VPBLENDMQZrmk;
+      break;
+    case X86::VMOVUPDZ128rmk:
+      Opc = X86::VBLENDMPDZ128rmk;
+      break;
+    case X86::VMOVUPDZ256rmk:
+      Opc = X86::VBLENDMPDZ256rmk;
+      break;
+    case X86::VMOVUPDZrmk:
+      Opc = X86::VBLENDMPDZrmk;
+      break;
+    case X86::VMOVUPSZ128rmk:
+      Opc = X86::VBLENDMPSZ128rmk;
+      break;
+    case X86::VMOVUPSZ256rmk:
+      Opc = X86::VBLENDMPSZ256rmk;
+      break;
+    case X86::VMOVUPSZrmk:
+      Opc = X86::VBLENDMPSZrmk;
+      break;
+    case X86::VMOVDQA32Z128rmk:
+      Opc = X86::VPBLENDMDZ128rmk;
+      break;
+    case X86::VMOVDQA32Z256rmk:
+      Opc = X86::VPBLENDMDZ256rmk;
+      break;
+    case X86::VMOVDQA32Zrmk:
+      Opc = X86::VPBLENDMDZrmk;
+      break;
+    case X86::VMOVDQA64Z128rmk:
+      Opc = X86::VPBLENDMQZ128rmk;
+      break;
+    case X86::VMOVDQA64Z256rmk:
+      Opc = X86::VPBLENDMQZ256rmk;
+      break;
+    case X86::VMOVDQA64Zrmk:
+      Opc = X86::VPBLENDMQZrmk;
+      break;
+    case X86::VMOVAPDZ128rmk:
+      Opc = X86::VBLENDMPDZ128rmk;
+      break;
+    case X86::VMOVAPDZ256rmk:
+      Opc = X86::VBLENDMPDZ256rmk;
+      break;
+    case X86::VMOVAPDZrmk:
+      Opc = X86::VBLENDMPDZrmk;
+      break;
+    case X86::VMOVAPSZ128rmk:
+      Opc = X86::VBLENDMPSZ128rmk;
+      break;
+    case X86::VMOVAPSZ256rmk:
+      Opc = X86::VBLENDMPSZ256rmk;
+      break;
+    case X86::VMOVAPSZrmk:
+      Opc = X86::VBLENDMPSZrmk;
+      break;
+    case X86::VBROADCASTSDZ256rmk:
+      Opc = X86::VBLENDMPDZ256rmbk;
+      break;
+    case X86::VBROADCASTSDZrmk:
+      Opc = X86::VBLENDMPDZrmbk;
+      break;
+    case X86::VBROADCASTSSZ128rmk:
+      Opc = X86::VBLENDMPSZ128rmbk;
+      break;
+    case X86::VBROADCASTSSZ256rmk:
+      Opc = X86::VBLENDMPSZ256rmbk;
+      break;
+    case X86::VBROADCASTSSZrmk:
+      Opc = X86::VBLENDMPSZrmbk;
+      break;
+    case X86::VPBROADCASTDZ128rmk:
+      Opc = X86::VPBLENDMDZ128rmbk;
+      break;
+    case X86::VPBROADCASTDZ256rmk:
+      Opc = X86::VPBLENDMDZ256rmbk;
+      break;
+    case X86::VPBROADCASTDZrmk:
+      Opc = X86::VPBLENDMDZrmbk;
+      break;
+    case X86::VPBROADCASTQZ128rmk:
+      Opc = X86::VPBLENDMQZ128rmbk;
+      break;
+    case X86::VPBROADCASTQZ256rmk:
+      Opc = X86::VPBLENDMQZ256rmbk;
+      break;
+    case X86::VPBROADCASTQZrmk:
+      Opc = X86::VPBLENDMQZrmbk;
+      break;
     }
 
     NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
-              .add(Dest)
-              .add(MI.getOperand(2))
-              .add(Src)
-              .add(MI.getOperand(3))
-              .add(MI.getOperand(4))
-              .add(MI.getOperand(5))
-              .add(MI.getOperand(6))
-              .add(MI.getOperand(7));
+                .add(Dest)
+                .add(MI.getOperand(2))
+                .add(Src)
+                .add(MI.getOperand(3))
+                .add(MI.getOperand(4))
+                .add(MI.getOperand(5))
+                .add(MI.getOperand(6))
+                .add(MI.getOperand(7));
     NumRegOperands = 4;
     break;
   }
@@ -1754,66 +1857,140 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
   case X86::VMOVDQU16Z128rrk:
   case X86::VMOVDQU16Z256rrk:
   case X86::VMOVDQU16Zrrk:
-  case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
-  case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
-  case X86::VMOVDQU32Zrrk:    case X86::VMOVDQA32Zrrk:
-  case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
-  case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
-  case X86::VMOVDQU64Zrrk:    case X86::VMOVDQA64Zrrk:
-  case X86::VMOVUPDZ128rrk:   case X86::VMOVAPDZ128rrk:
-  case X86::VMOVUPDZ256rrk:   case X86::VMOVAPDZ256rrk:
-  case X86::VMOVUPDZrrk:      case X86::VMOVAPDZrrk:
-  case X86::VMOVUPSZ128rrk:   case X86::VMOVAPSZ128rrk:
-  case X86::VMOVUPSZ256rrk:   case X86::VMOVAPSZ256rrk:
-  case X86::VMOVUPSZrrk:      case X86::VMOVAPSZrrk: {
+  case X86::VMOVDQU32Z128rrk:
+  case X86::VMOVDQA32Z128rrk:
+  case X86::VMOVDQU32Z256rrk:
+  case X86::VMOVDQA32Z256rrk:
+  case X86::VMOVDQU32Zrrk:
+  case X86::VMOVDQA32Zrrk:
+  case X86::VMOVDQU64Z128rrk:
+  case X86::VMOVDQA64Z128rrk:
+  case X86::VMOVDQU64Z256rrk:
+  case X86::VMOVDQA64Z256rrk:
+  case X86::VMOVDQU64Zrrk:
+  case X86::VMOVDQA64Zrrk:
+  case X86::VMOVUPDZ128rrk:
+  case X86::VMOVAPDZ128rrk:
+  case X86::VMOVUPDZ256rrk:
+  case X86::VMOVAPDZ256rrk:
+  case X86::VMOVUPDZrrk:
+  case X86::VMOVAPDZrrk:
+  case X86::VMOVUPSZ128rrk:
+  case X86::VMOVAPSZ128rrk:
+  case X86::VMOVUPSZ256rrk:
+  case X86::VMOVAPSZ256rrk:
+  case X86::VMOVUPSZrrk:
+  case X86::VMOVAPSZrrk: {
     unsigned Opc;
     switch (MIOpc) {
-    default: llvm_unreachable("Unreachable!");
-    case X86::VMOVDQU8Z128rrk:  Opc = X86::VPBLENDMBZ128rrk; break;
-    case X86::VMOVDQU8Z256rrk:  Opc = X86::VPBLENDMBZ256rrk; break;
-    case X86::VMOVDQU8Zrrk:     Opc = X86::VPBLENDMBZrrk;    break;
-    case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
-    case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
-    case X86::VMOVDQU16Zrrk:    Opc = X86::VPBLENDMWZrrk;    break;
-    case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
-    case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
-    case X86::VMOVDQU32Zrrk:    Opc = X86::VPBLENDMDZrrk;    break;
-    case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
-    case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
-    case X86::VMOVDQU64Zrrk:    Opc = X86::VPBLENDMQZrrk;    break;
-    case X86::VMOVUPDZ128rrk:   Opc = X86::VBLENDMPDZ128rrk; break;
-    case X86::VMOVUPDZ256rrk:   Opc = X86::VBLENDMPDZ256rrk; break;
-    case X86::VMOVUPDZrrk:      Opc = X86::VBLENDMPDZrrk;    break;
-    case X86::VMOVUPSZ128rrk:   Opc = X86::VBLENDMPSZ128rrk; break;
-    case X86::VMOVUPSZ256rrk:   Opc = X86::VBLENDMPSZ256rrk; break;
-    case X86::VMOVUPSZrrk:      Opc = X86::VBLENDMPSZrrk;    break;
-    case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
-    case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
-    case X86::VMOVDQA32Zrrk:    Opc = X86::VPBLENDMDZrrk;    break;
-    case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
-    case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
-    case X86::VMOVDQA64Zrrk:    Opc = X86::VPBLENDMQZrrk;    break;
-    case X86::VMOVAPDZ128rrk:   Opc = X86::VBLENDMPDZ128rrk; break;
-    case X86::VMOVAPDZ256rrk:   Opc = X86::VBLENDMPDZ256rrk; break;
-    case X86::VMOVAPDZrrk:      Opc = X86::VBLENDMPDZrrk;    break;
-    case X86::VMOVAPSZ128rrk:   Opc = X86::VBLENDMPSZ128rrk; break;
-    case X86::VMOVAPSZ256rrk:   Opc = X86::VBLENDMPSZ256rrk; break;
-    case X86::VMOVAPSZrrk:      Opc = X86::VBLENDMPSZrrk;    break;
+    default:
+      llvm_unreachable("Unreachable!");
+    case X86::VMOVDQU8Z128rrk:
+      Opc = X86::VPBLENDMBZ128rrk;
+      break;
+    case X86::VMOVDQU8Z256rrk:
+      Opc = X86::VPBLENDMBZ256rrk;
+      break;
+    case X86::VMOVDQU8Zrrk:
+      Opc = X86::VPBLENDMBZrrk;
+      break;
+    case X86::VMOVDQU16Z128rrk:
+      Opc = X86::VPBLENDMWZ128rrk;
+      break;
+    case X86::VMOVDQU16Z256rrk:
+      Opc = X86::VPBLENDMWZ256rrk;
+      break;
+    case X86::VMOVDQU16Zrrk:
+      Opc = X86::VPBLENDMWZrrk;
+      break;
+    case X86::VMOVDQU32Z128rrk:
+      Opc = X86::VPBLENDMDZ128rrk;
+      break;
+    case X86::VMOVDQU32Z256rrk:
+      Opc = X86::VPBLENDMDZ256rrk;
+      break;
+    case X86::VMOVDQU32Zrrk:
+      Opc = X86::VPBLENDMDZrrk;
+      break;
+    case X86::VMOVDQU64Z128rrk:
+      Opc = X86::VPBLENDMQZ128rrk;
+      break;
+    case X86::VMOVDQU64Z256rrk:
+      Opc = X86::VPBLENDMQZ256rrk;
+      break;
+    case X86::VMOVDQU64Zrrk:
+      Opc = X86::VPBLENDMQZrrk;
+      break;
+    case X86::VMOVUPDZ128rrk:
+      Opc = X86::VBLENDMPDZ128rrk;
+      break;
+    case X86::VMOVUPDZ256rrk:
+      Opc = X86::VBLENDMPDZ256rrk;
+      break;
+    case X86::VMOVUPDZrrk:
+      Opc = X86::VBLENDMPDZrrk;
+      break;
+    case X86::VMOVUPSZ128rrk:
+      Opc = X86::VBLENDMPSZ128rrk;
+      break;
+    case X86::VMOVUPSZ256rrk:
+      Opc = X86::VBLENDMPSZ256rrk;
+      break;
+    case X86::VMOVUPSZrrk:
+      Opc = X86::VBLENDMPSZrrk;
+      break;
+    case X86::VMOVDQA32Z128rrk:
+      Opc = X86::VPBLENDMDZ128rrk;
+      break;
+    case X86::VMOVDQA32Z256rrk:
+      Opc = X86::VPBLENDMDZ256rrk;
+      break;
+    case X86::VMOVDQA32Zrrk:
+      Opc = X86::VPBLENDMDZrrk;
+      break;
+    case X86::VMOVDQA64Z128rrk:
+      Opc = X86::VPBLENDMQZ128rrk;
+      break;
+    case X86::VMOVDQA64Z256rrk:
+      Opc = X86::VPBLENDMQZ256rrk;
+      break;
+    case X86::VMOVDQA64Zrrk:
+      Opc = X86::VPBLENDMQZrrk;
+      break;
+    case X86::VMOVAPDZ128rrk:
+      Opc = X86::VBLENDMPDZ128rrk;
+      break;
+    case X86::VMOVAPDZ256rrk:
+      Opc = X86::VBLENDMPDZ256rrk;
+      break;
+    case X86::VMOVAPDZrrk:
+      Opc = X86::VBLENDMPDZrrk;
+      break;
+    case X86::VMOVAPSZ128rrk:
+      Opc = X86::VBLENDMPSZ128rrk;
+      break;
+    case X86::VMOVAPSZ256rrk:
+      Opc = X86::VBLENDMPSZ256rrk;
+      break;
+    case X86::VMOVAPSZrrk:
+      Opc = X86::VBLENDMPSZrrk;
+      break;
     }
 
     NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
-              .add(Dest)
-              .add(MI.getOperand(2))
-              .add(Src)
-              .add(MI.getOperand(3));
+                .add(Dest)
+                .add(MI.getOperand(2))
+                .add(Src)
+                .add(MI.getOperand(3));
     NumRegOperands = 4;
     break;
   }
   }
 
-  if (!NewMI) return nullptr;
+  if (!NewMI)
+    return nullptr;
 
-  if (LV) {  // Update live variables
+  if (LV) { // Update live variables
     for (unsigned I = 0; I < NumRegOperands; ++I) {
       MachineOperand &Op = MI.getOperand(I);
       if (Op.isReg() && (Op.isDead() || Op.isKill()))
@@ -1879,8 +2056,8 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
          "Intrinsic instructions can't commute operand 1");
 
   // Determine which case this commute is or if it can't be done.
-  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
-                                         SrcOpIdx2);
+  unsigned Case =
+      getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
   assert(Case < 3 && "Unexpected case number!");
 
   // Define the FMA forms mapping array that helps to map input FMA form
@@ -1890,22 +2067,21 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
   const unsigned Form213Index = 1;
   const unsigned Form231Index = 2;
   static const unsigned FormMapping[][3] = {
-    // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
-    // FMA132 A, C, b; ==> FMA231 C, A, b;
-    // FMA213 B, A, c; ==> FMA213 A, B, c;
-    // FMA231 C, A, b; ==> FMA132 A, C, b;
-    { Form231Index, Form213Index, Form132Index },
-    // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
-    // FMA132 A, c, B; ==> FMA132 B, c, A;
-    // FMA213 B, a, C; ==> FMA231 C, a, B;
-    // FMA231 C, a, B; ==> FMA213 B, a, C;
-    { Form132Index, Form231Index, Form213Index },
-    // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
-    // FMA132 a, C, B; ==> FMA213 a, B, C;
-    // FMA213 b, A, C; ==> FMA132 b, C, A;
-    // FMA231 c, A, B; ==> FMA231 c, B, A;
-    { Form213Index, Form132Index, Form231Index }
-  };
+      // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
+      // FMA132 A, C, b; ==> FMA231 C, A, b;
+      // FMA213 B, A, c; ==> FMA213 A, B, c;
+      // FMA231 C, A, b; ==> FMA132 A, C, b;
+      {Form231Index, Form213Index, Form132Index},
+      // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
+      // FMA132 A, c, B; ==> FMA132 B, c, A;
+      // FMA213 B, a, C; ==> FMA231 C, a, B;
+      // FMA231 C, a, B; ==> FMA213 B, a, C;
+      {Form132Index, Form231Index, Form213Index},
+      // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
+      // FMA132 a, C, B; ==> FMA213 a, B, C;
+      // FMA213 b, A, C; ==> FMA132 b, C, A;
+      // FMA231 c, A, B; ==> FMA231 c, B, A;
+      {Form213Index, Form132Index, Form231Index}};
 
   unsigned FMAForms[3];
   FMAForms[0] = FMA3Group.get132Opcode();
@@ -1923,63 +2099,86 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
 static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
                              unsigned SrcOpIdx2) {
   // Determine which case this commute is or if it can't be done.
-  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
-                                         SrcOpIdx2);
+  unsigned Case =
+      getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
   assert(Case < 3 && "Unexpected case value!");
 
   // For each case we need to swap two pairs of bits in the final immediate.
   static const uint8_t SwapMasks[3][4] = {
-    { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
-    { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
-    { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
+      {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5.
+      {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6.
+      {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6.
   };
 
-  uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
+  uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
   // Clear out the bits we are swapping.
   uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
                            SwapMasks[Case][2] | SwapMasks[Case][3]);
   // If the immediate had a bit of the pair set, then set the opposite bit.
-  if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
-  if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
-  if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
-  if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
-  MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
+  if (Imm & SwapMasks[Case][0])
+    NewImm |= SwapMasks[Case][1];
+  if (Imm & SwapMasks[Case][1])
+    NewImm |= SwapMasks[Case][0];
+  if (Imm & SwapMasks[Case][2])
+    NewImm |= SwapMasks[Case][3];
+  if (Imm & SwapMasks[Case][3])
+    NewImm |= SwapMasks[Case][2];
+  MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm);
 }
 
 // Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
 // commuted.
 static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
-#define VPERM_CASES(Suffix) \
-  case X86::VPERMI2##Suffix##128rr:    case X86::VPERMT2##Suffix##128rr:    \
-  case X86::VPERMI2##Suffix##256rr:    case X86::VPERMT2##Suffix##256rr:    \
-  case X86::VPERMI2##Suffix##rr:       case X86::VPERMT2##Suffix##rr:       \
-  case X86::VPERMI2##Suffix##128rm:    case X86::VPERMT2##Suffix##128rm:    \
-  case X86::VPERMI2##Suffix##256rm:    case X86::VPERMT2##Suffix##256rm:    \
-  case X86::VPERMI2##Suffix##rm:       case X86::VPERMT2##Suffix##rm:       \
-  case X86::VPERMI2##Suffix##128rrkz:  case X86::VPERMT2##Suffix##128rrkz:  \
-  case X86::VPERMI2##Suffix##256rrkz:  case X86::VPERMT2##Suffix##256rrkz:  \
-  case X86::VPERMI2##Suffix##rrkz:     case X86::VPERMT2##Suffix##rrkz:     \
-  case X86::VPERMI2##Suffix##128rmkz:  case X86::VPERMT2##Suffix##128rmkz:  \
-  case X86::VPERMI2##Suffix##256rmkz:  case X86::VPERMT2##Suffix##256rmkz:  \
-  case X86::VPERMI2##Suffix##rmkz:     case X86::VPERMT2##Suffix##rmkz:
-
-#define VPERM_CASES_BROADCAST(Suffix) \
-  VPERM_CASES(Suffix) \
-  case X86::VPERMI2##Suffix##128rmb:   case X86::VPERMT2##Suffix##128rmb:   \
-  case X86::VPERMI2##Suffix##256rmb:   case X86::VPERMT2##Suffix##256rmb:   \
-  case X86::VPERMI2##Suffix##rmb:      case X86::VPERMT2##Suffix##rmb:      \
-  case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
-  case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
-  case X86::VPERMI2##Suffix##rmbkz:    case X86::VPERMT2##Suffix##rmbkz:
+#define VPERM_CASES(Suffix)                                                    \
+  case X86::VPERMI2##Suffix##128rr:                                            \
+  case X86::VPERMT2##Suffix##128rr:                                            \
+  case X86::VPERMI2##Suffix##256rr:                                            \
+  case X86::VPERMT2##Suffix##256rr:                                            \
+  case X86::VPERMI2##Suffix##rr:                                               \
+  case X86::VPERMT2##Suffix##rr:                                               \
+  case X86::VPERMI2##Suffix##128rm:                                            \
+  case X86::VPERMT2##Suffix##128rm:                                            \
+  case X86::VPERMI2##Suffix##256rm:                                            \
+  case X86::VPERMT2##Suffix##256rm:                                            \
+  case X86::VPERMI2##Suffix##rm:                                               \
+  case X86::VPERMT2##Suffix##rm:                                               \
+  case X86::VPERMI2##Suffix##128rrkz:                                          \
+  case X86::VPERMT2##Suffix##128rrkz:                                          \
+  case X86::VPERMI2##Suffix##256rrkz:                                          \
+  case X86::VPERMT2##Suffix##256rrkz:                                          \
+  case X86::VPERMI2##Suffix##rrkz:                                             \
+  case X86::VPERMT2##Suffix##rrkz:                                             \
+  case X86::VPERMI2##Suffix##128rmkz:                                          \
+  case X86::VPERMT2##Suffix##128rmkz:                                          \
+  case X86::VPERMI2##Suffix##256rmkz:                                          \
+  case X86::VPERMT2##Suffix##256rmkz:                                          \
+  case X86::VPERMI2##Suffix##rmkz:                                             \
+  case X86::VPERMT2##Suffix##rmkz:
+
+#define VPERM_CASES_BROADCAST(Suffix)                                          \
+  VPERM_CASES(Suffix)                                                          \
+  case X86::VPERMI2##Suffix##128rmb:                                           \
+  case X86::VPERMT2##Suffix##128rmb:                                           \
+  case X86::VPERMI2##Suffix##256rmb:                                           \
+  case X86::VPERMT2##Suffix##256rmb:                                           \
+  case X86::VPERMI2##Suffix##rmb:                                              \
+  case X86::VPERMT2##Suffix##rmb:                                              \
+  case X86::VPERMI2##Suffix##128rmbkz:                                         \
+  case X86::VPERMT2##Suffix##128rmbkz:                                         \
+  case X86::VPERMI2##Suffix##256rmbkz:                                         \
+  case X86::VPERMT2##Suffix##256rmbkz:                                         \
+  case X86::VPERMI2##Suffix##rmbkz:                                            \
+  case X86::VPERMT2##Suffix##rmbkz:
 
   switch (Opcode) {
-  default: return false;
-  VPERM_CASES(B)
-  VPERM_CASES_BROADCAST(D)
-  VPERM_CASES_BROADCAST(PD)
-  VPERM_CASES_BROADCAST(PS)
-  VPERM_CASES_BROADCAST(Q)
-  VPERM_CASES(W)
+  default:
+    return false;
+    VPERM_CASES(B)
+    VPERM_CASES_BROADCAST(D)
+    VPERM_CASES_BROADCAST(PD)
+    VPERM_CASES_BROADCAST(PS)
+    VPERM_CASES_BROADCAST(Q)
+    VPERM_CASES(W)
     return true;
   }
 #undef VPERM_CASES_BROADCAST
@@ -1989,42 +2188,60 @@ static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
 // Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
 // from the I opcode to the T opcode and vice versa.
 static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
-#define VPERM_CASES(Orig, New) \
-  case X86::Orig##128rr:    return X86::New##128rr;   \
-  case X86::Orig##128rrkz:  return X86::New##128rrkz; \
-  case X86::Orig##128rm:    return X86::New##128rm;   \
-  case X86::Orig##128rmkz:  return X86::New##128rmkz; \
-  case X86::Orig##256rr:    return X86::New##256rr;   \
-  case X86::Orig##256rrkz:  return X86::New##256rrkz; \
-  case X86::Orig##256rm:    return X86::New##256rm;   \
-  case X86::Orig##256rmkz:  return X86::New##256rmkz; \
-  case X86::Orig##rr:       return X86::New##rr;      \
-  case X86::Orig##rrkz:     return X86::New##rrkz;    \
-  case X86::Orig##rm:       return X86::New##rm;      \
-  case X86::Orig##rmkz:     return X86::New##rmkz;
-
-#define VPERM_CASES_BROADCAST(Orig, New) \
-  VPERM_CASES(Orig, New) \
-  case X86::Orig##128rmb:   return X86::New##128rmb;   \
-  case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
-  case X86::Orig##256rmb:   return X86::New##256rmb;   \
-  case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
-  case X86::Orig##rmb:      return X86::New##rmb;      \
-  case X86::Orig##rmbkz:    return X86::New##rmbkz;
+#define VPERM_CASES(Orig, New)                                                 \
+  case X86::Orig##128rr:                                                       \
+    return X86::New##128rr;                                                    \
+  case X86::Orig##128rrkz:                                                     \
+    return X86::New##128rrkz;                                                  \
+  case X86::Orig##128rm:                                                       \
+    return X86::New##128rm;                                                    \
+  case X86::Orig##128rmkz:                                                     \
+    return X86::New##128rmkz;                                                  \
+  case X86::Orig##256rr:                                                       \
+    return X86::New##256rr;                                                    \
+  case X86::Orig##256rrkz:                                                     \
+    return X86::New##256rrkz;                                                  \
+  case X86::Orig##256rm:                                                       \
+    return X86::New##256rm;                                                    \
+  case X86::Orig##256rmkz:                                                     \
+    return X86::New##256rmkz;                                                  \
+  case X86::Orig##rr:                                                          \
+    return X86::New##rr;                                                       \
+  case X86::Orig##rrkz:                                                        \
+    return X86::New##rrkz;                                                     \
+  case X86::Orig##rm:                                                          \
+    return X86::New##rm;                                                       \
+  case X86::Orig##rmkz:                                                        \
+    return X86::New##rmkz;
+
+#define VPERM_CASES_BROADCAST(Orig, New)                                       \
+  VPERM_CASES(Orig, New)                                                       \
+  case X86::Orig##128rmb:                                                      \
+    return X86::New##128rmb;                                                   \
+  case X86::Orig##128rmbkz:                                                    \
+    return X86::New##128rmbkz;                                                 \
+  case X86::Orig##256rmb:                                                      \
+    return X86::New##256rmb;                                                   \
+  case X86::Orig##256rmbkz:                                                    \
+    return X86::New##256rmbkz;                                                 \
+  case X86::Orig##rmb:                                                         \
+    return X86::New##rmb;                                                      \
+  case X86::Orig##rmbkz:                                                       \
+    return X86::New##rmbkz;
 
   switch (Opcode) {
-  VPERM_CASES(VPERMI2B, VPERMT2B)
-  VPERM_CASES_BROADCAST(VPERMI2D,  VPERMT2D)
-  VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
-  VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
-  VPERM_CASES_BROADCAST(VPERMI2Q,  VPERMT2Q)
-  VPERM_CASES(VPERMI2W, VPERMT2W)
-  VPERM_CASES(VPERMT2B, VPERMI2B)
-  VPERM_CASES_BROADCAST(VPERMT2D,  VPERMI2D)
-  VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
-  VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
-  VPERM_CASES_BROADCAST(VPERMT2Q,  VPERMI2Q)
-  VPERM_CASES(VPERMT2W, VPERMI2W)
+    VPERM_CASES(VPERMI2B, VPERMT2B)
+    VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
+    VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
+    VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
+    VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
+    VPERM_CASES(VPERMI2W, VPERMT2W)
+    VPERM_CASES(VPERMT2B, VPERMI2B)
+    VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
+    VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
+    VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
+    VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
+    VPERM_CASES(VPERMT2W, VPERMI2W)
   }
 
   llvm_unreachable("Unreachable!");
@@ -2047,17 +2264,37 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
   case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
   case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
-  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+  case X86::SHLD64rri8: { // A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B,
+                          // (64-I)
     unsigned Opc;
     unsigned Size;
     switch (MI.getOpcode()) {
-    default: llvm_unreachable("Unreachable!");
-    case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
-    case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
-    case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
-    case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
-    case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
-    case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+    default:
+      llvm_unreachable("Unreachable!");
+    case X86::SHRD16rri8:
+      Size = 16;
+      Opc = X86::SHLD16rri8;
+      break;
+    case X86::SHLD16rri8:
+      Size = 16;
+      Opc = X86::SHRD16rri8;
+      break;
+    case X86::SHRD32rri8:
+      Size = 32;
+      Opc = X86::SHLD32rri8;
+      break;
+    case X86::SHLD32rri8:
+      Size = 32;
+      Opc = X86::SHRD32rri8;
+      break;
+    case X86::SHRD64rri8:
+      Size = 64;
+      Opc = X86::SHLD64rri8;
+      break;
+    case X86::SHLD64rri8:
+      Size = 64;
+      Opc = X86::SHRD64rri8;
+      break;
     }
     unsigned Amt = MI.getOperand(3).getImm();
     auto &WorkingMI = cloneIfNew(MI);
@@ -2085,19 +2322,32 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
       unsigned Mask, Opc;
       switch (MI.getOpcode()) {
-      default: llvm_unreachable("Unreachable!");
-      case X86::BLENDPDrri:  Opc = X86::MOVSDrr;  Mask = 0x03; break;
-      case X86::BLENDPSrri:  Opc = X86::MOVSSrr;  Mask = 0x0F; break;
-      case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
-      case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
+      default:
+        llvm_unreachable("Unreachable!");
+      case X86::BLENDPDrri:
+        Opc = X86::MOVSDrr;
+        Mask = 0x03;
+        break;
+      case X86::BLENDPSrri:
+        Opc = X86::MOVSSrr;
+        Mask = 0x0F;
+        break;
+      case X86::VBLENDPDrri:
+        Opc = X86::VMOVSDrr;
+        Mask = 0x03;
+        break;
+      case X86::VBLENDPSrri:
+        Opc = X86::VMOVSSrr;
+        Mask = 0x0F;
+        break;
       }
       if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
         auto &WorkingMI = cloneIfNew(MI);
         WorkingMI.setDesc(get(Opc));
         WorkingMI.removeOperand(3);
         return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
-                                                       /*NewMI=*/false,
-                                                       OpIdx1, OpIdx2);
+                                                       /*NewMI=*/false, OpIdx1,
+                                                       OpIdx2);
       }
     }
     [[fallthrough]];
@@ -2107,21 +2357,44 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   case X86::VPBLENDDrri:
   case X86::VPBLENDWrri:
   case X86::VPBLENDDYrri:
-  case X86::VPBLENDWYrri:{
+  case X86::VPBLENDWYrri: {
     int8_t Mask;
     switch (MI.getOpcode()) {
-    default: llvm_unreachable("Unreachable!");
-    case X86::BLENDPDrri:    Mask = (int8_t)0x03; break;
-    case X86::BLENDPSrri:    Mask = (int8_t)0x0F; break;
-    case X86::PBLENDWrri:    Mask = (int8_t)0xFF; break;
-    case X86::VBLENDPDrri:   Mask = (int8_t)0x03; break;
-    case X86::VBLENDPSrri:   Mask = (int8_t)0x0F; break;
-    case X86::VBLENDPDYrri:  Mask = (int8_t)0x0F; break;
-    case X86::VBLENDPSYrri:  Mask = (int8_t)0xFF; break;
-    case X86::VPBLENDDrri:   Mask = (int8_t)0x0F; break;
-    case X86::VPBLENDWrri:   Mask = (int8_t)0xFF; break;
-    case X86::VPBLENDDYrri:  Mask = (int8_t)0xFF; break;
-    case X86::VPBLENDWYrri:  Mask = (int8_t)0xFF; break;
+    default:
+      llvm_unreachable("Unreachable!");
+    case X86::BLENDPDrri:
+      Mask = (int8_t)0x03;
+      break;
+    case X86::BLENDPSrri:
+      Mask = (int8_t)0x0F;
+      break;
+    case X86::PBLENDWrri:
+      Mask = (int8_t)0xFF;
+      break;
+    case X86::VBLENDPDrri:
+      Mask = (int8_t)0x03;
+      break;
+    case X86::VBLENDPSrri:
+      Mask = (int8_t)0x0F;
+      break;
+    case X86::VBLENDPDYrri:
+      Mask = (int8_t)0x0F;
+      break;
+    case X86::VBLENDPSYrri:
+      Mask = (int8_t)0xFF;
+      break;
+    case X86::VPBLENDDrri:
+      Mask = (int8_t)0x0F;
+      break;
+    case X86::VPBLENDWrri:
+      Mask = (int8_t)0xFF;
+      break;
+    case X86::VPBLENDDYrri:
+      Mask = (int8_t)0xFF;
+      break;
+    case X86::VPBLENDWYrri:
+      Mask = (int8_t)0xFF;
+      break;
     }
     // Only the least significant bits of Imm are used.
     // Using int8_t to ensure it will be sign extended to the int64_t that
@@ -2157,16 +2430,29 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   case X86::MOVSDrr:
   case X86::MOVSSrr:
   case X86::VMOVSDrr:
-  case X86::VMOVSSrr:{
+  case X86::VMOVSSrr: {
     // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
     if (Subtarget.hasSSE41()) {
       unsigned Mask, Opc;
       switch (MI.getOpcode()) {
-      default: llvm_unreachable("Unreachable!");
-      case X86::MOVSDrr:  Opc = X86::BLENDPDrri;  Mask = 0x02; break;
-      case X86::MOVSSrr:  Opc = X86::BLENDPSrri;  Mask = 0x0E; break;
-      case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
-      case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+      default:
+        llvm_unreachable("Unreachable!");
+      case X86::MOVSDrr:
+        Opc = X86::BLENDPDrri;
+        Mask = 0x02;
+        break;
+      case X86::MOVSSrr:
+        Opc = X86::BLENDPSrri;
+        Mask = 0x0E;
+        break;
+      case X86::VMOVSDrr:
+        Opc = X86::VBLENDPDrri;
+        Mask = 0x02;
+        break;
+      case X86::VMOVSSrr:
+        Opc = X86::VBLENDPSrri;
+        Mask = 0x0E;
+        break;
       }
 
       auto &WorkingMI = cloneIfNew(MI);
@@ -2211,30 +2497,54 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
-  case X86::VPCMPBZ128rri:  case X86::VPCMPUBZ128rri:
-  case X86::VPCMPBZ256rri:  case X86::VPCMPUBZ256rri:
-  case X86::VPCMPBZrri:     case X86::VPCMPUBZrri:
-  case X86::VPCMPDZ128rri:  case X86::VPCMPUDZ128rri:
-  case X86::VPCMPDZ256rri:  case X86::VPCMPUDZ256rri:
-  case X86::VPCMPDZrri:     case X86::VPCMPUDZrri:
-  case X86::VPCMPQZ128rri:  case X86::VPCMPUQZ128rri:
-  case X86::VPCMPQZ256rri:  case X86::VPCMPUQZ256rri:
-  case X86::VPCMPQZrri:     case X86::VPCMPUQZrri:
-  case X86::VPCMPWZ128rri:  case X86::VPCMPUWZ128rri:
-  case X86::VPCMPWZ256rri:  case X86::VPCMPUWZ256rri:
-  case X86::VPCMPWZrri:     case X86::VPCMPUWZrri:
-  case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
-  case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
-  case X86::VPCMPBZrrik:    case X86::VPCMPUBZrrik:
-  case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
-  case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
-  case X86::VPCMPDZrrik:    case X86::VPCMPUDZrrik:
-  case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
-  case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
-  case X86::VPCMPQZrrik:    case X86::VPCMPUQZrrik:
-  case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
-  case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
-  case X86::VPCMPWZrrik:    case X86::VPCMPUWZrrik: {
+  case X86::VPCMPBZ128rri:
+  case X86::VPCMPUBZ128rri:
+  case X86::VPCMPBZ256rri:
+  case X86::VPCMPUBZ256rri:
+  case X86::VPCMPBZrri:
+  case X86::VPCMPUBZrri:
+  case X86::VPCMPDZ128rri:
+  case X86::VPCMPUDZ128rri:
+  case X86::VPCMPDZ256rri:
+  case X86::VPCMPUDZ256rri:
+  case X86::VPCMPDZrri:
+  case X86::VPCMPUDZrri:
+  case X86::VPCMPQZ128rri:
+  case X86::VPCMPUQZ128rri:
+  case X86::VPCMPQZ256rri:
+  case X86::VPCMPUQZ256rri:
+  case X86::VPCMPQZrri:
+  case X86::VPCMPUQZrri:
+  case X86::VPCMPWZ128rri:
+  case X86::VPCMPUWZ128rri:
+  case X86::VPCMPWZ256rri:
+  case X86::VPCMPUWZ256rri:
+  case X86::VPCMPWZrri:
+  case X86::VPCMPUWZrri:
+  case X86::VPCMPBZ128rrik:
+  case X86::VPCMPUBZ128rrik:
+  case X86::VPCMPBZ256rrik:
+  case X86::VPCMPUBZ256rrik:
+  case X86::VPCMPBZrrik:
+  case X86::VPCMPUBZrrik:
+  case X86::VPCMPDZ128rrik:
+  case X86::VPCMPUDZ128rrik:
+  case X86::VPCMPDZ256rrik:
+  case X86::VPCMPUDZ256rrik:
+  case X86::VPCMPDZrrik:
+  case X86::VPCMPUDZrrik:
+  case X86::VPCMPQZ128rrik:
+  case X86::VPCMPUQZ128rrik:
+  case X86::VPCMPQZ256rrik:
+  case X86::VPCMPUQZ256rrik:
+  case X86::VPCMPQZrrik:
+  case X86::VPCMPUQZrrik:
+  case X86::VPCMPWZ128rrik:
+  case X86::VPCMPUWZ128rrik:
+  case X86::VPCMPWZ256rrik:
+  case X86::VPCMPUWZ256rrik:
+  case X86::VPCMPWZrrik:
+  case X86::VPCMPUWZrrik: {
     // Flip comparison mode immediate (if necessary).
     unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
     Imm = X86::getSwappedVPCMPImm(Imm);
@@ -2243,10 +2553,14 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
-  case X86::VPCOMBri: case X86::VPCOMUBri:
-  case X86::VPCOMDri: case X86::VPCOMUDri:
-  case X86::VPCOMQri: case X86::VPCOMUQri:
-  case X86::VPCOMWri: case X86::VPCOMUWri: {
+  case X86::VPCOMBri:
+  case X86::VPCOMUBri:
+  case X86::VPCOMDri:
+  case X86::VPCOMUDri:
+  case X86::VPCOMQri:
+  case X86::VPCOMUQri:
+  case X86::VPCOMWri:
+  case X86::VPCOMUWri: {
     // Flip comparison mode immediate (if necessary).
     unsigned Imm = MI.getOperand(3).getImm() & 0x7;
     Imm = X86::getSwappedVPCOMImm(Imm);
@@ -2274,7 +2588,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   case X86::VCMPPDZ256rrik:
   case X86::VCMPPSZ256rrik: {
     unsigned Imm =
-                MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
+        MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
     Imm = X86::getSwappedVCMPImm(Imm);
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
@@ -2302,20 +2616,35 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
 
     unsigned Opc = MI.getOpcode();
     switch (Opc) {
-    default: llvm_unreachable("Unreachable!");
-    case X86::MOVHLPSrr:       Opc = X86::UNPCKHPDrr;      break;
-    case X86::UNPCKHPDrr:      Opc = X86::MOVHLPSrr;       break;
-    case X86::VMOVHLPSrr:      Opc = X86::VUNPCKHPDrr;     break;
-    case X86::VUNPCKHPDrr:     Opc = X86::VMOVHLPSrr;      break;
-    case X86::VMOVHLPSZrr:     Opc = X86::VUNPCKHPDZ128rr; break;
-    case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr;     break;
+    default:
+      llvm_unreachable("Unreachable!");
+    case X86::MOVHLPSrr:
+      Opc = X86::UNPCKHPDrr;
+      break;
+    case X86::UNPCKHPDrr:
+      Opc = X86::MOVHLPSrr;
+      break;
+    case X86::VMOVHLPSrr:
+      Opc = X86::VUNPCKHPDrr;
+      break;
+    case X86::VUNPCKHPDrr:
+      Opc = X86::VMOVHLPSrr;
+      break;
+    case X86::VMOVHLPSZrr:
+      Opc = X86::VUNPCKHPDZ128rr;
+      break;
+    case X86::VUNPCKHPDZ128rr:
+      Opc = X86::VMOVHLPSZrr;
+      break;
     }
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(Opc));
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
-  case X86::CMOV16rr:  case X86::CMOV32rr:  case X86::CMOV64rr: {
+  case X86::CMOV16rr:
+  case X86::CMOV32rr:
+  case X86::CMOV64rr: {
     auto &WorkingMI = cloneIfNew(MI);
     unsigned OpNo = MI.getDesc().getNumOperands() - 1;
     X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
@@ -2323,24 +2652,36 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
-  case X86::VPTERNLOGDZrri:      case X86::VPTERNLOGDZrmi:
-  case X86::VPTERNLOGDZ128rri:   case X86::VPTERNLOGDZ128rmi:
-  case X86::VPTERNLOGDZ256rri:   case X86::VPTERNLOGDZ256rmi:
-  case X86::VPTERNLOGQZrri:      case X86::VPTERNLOGQZrmi:
-  case X86::VPTERNLOGQZ128rri:   case X86::VPTERNLOGQZ128rmi:
-  case X86::VPTERNLOGQZ256rri:   case X86::VPTERNLOGQZ256rmi:
+  case X86::VPTERNLOGDZrri:
+  case X86::VPTERNLOGDZrmi:
+  case X86::VPTERNLOGDZ128rri:
+  case X86::VPTERNLOGDZ128rmi:
+  case X86::VPTERNLOGDZ256rri:
+  case X86::VPTERNLOGDZ256rmi:
+  case X86::VPTERNLOGQZrri:
+  case X86::VPTERNLOGQZrmi:
+  case X86::VPTERNLOGQZ128rri:
+  case X86::VPTERNLOGQZ128rmi:
+  case X86::VPTERNLOGQZ256rri:
+  case X86::VPTERNLOGQZ256rmi:
   case X86::VPTERNLOGDZrrik:
   case X86::VPTERNLOGDZ128rrik:
   case X86::VPTERNLOGDZ256rrik:
   case X86::VPTERNLOGQZrrik:
   case X86::VPTERNLOGQZ128rrik:
   case X86::VPTERNLOGQZ256rrik:
-  case X86::VPTERNLOGDZrrikz:    case X86::VPTERNLOGDZrmikz:
-  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
-  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
-  case X86::VPTERNLOGQZrrikz:    case X86::VPTERNLOGQZrmikz:
-  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
-  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+  case X86::VPTERNLOGDZrrikz:
+  case X86::VPTERNLOGDZrmikz:
+  case X86::VPTERNLOGDZ128rrikz:
+  case X86::VPTERNLOGDZ128rmikz:
+  case X86::VPTERNLOGDZ256rrikz:
+  case X86::VPTERNLOGDZ256rmikz:
+  case X86::VPTERNLOGQZrrikz:
+  case X86::VPTERNLOGQZrmikz:
+  case X86::VPTERNLOGQZ128rrikz:
+  case X86::VPTERNLOGQZ128rmikz:
+  case X86::VPTERNLOGQZ256rrikz:
+  case X86::VPTERNLOGQZ256rmikz:
   case X86::VPTERNLOGDZ128rmbi:
   case X86::VPTERNLOGDZ256rmbi:
   case X86::VPTERNLOGDZrmbi:
@@ -2367,11 +2708,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                      OpIdx1, OpIdx2);
     }
 
-    const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
-                                                      MI.getDesc().TSFlags);
+    const X86InstrFMA3Group *FMA3Group =
+        getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
     if (FMA3Group) {
       unsigned Opc =
-        getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
+          getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
       auto &WorkingMI = cloneIfNew(MI);
       WorkingMI.setDesc(get(Opc));
       return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
@@ -2383,11 +2724,10 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   }
 }
 
-bool
-X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
-                                            unsigned &SrcOpIdx1,
-                                            unsigned &SrcOpIdx2,
-                                            bool IsIntrinsic) const {
+bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+                                                 unsigned &SrcOpIdx1,
+                                                 unsigned &SrcOpIdx2,
+                                                 bool IsIntrinsic) const {
   uint64_t TSFlags = MI.getDesc().TSFlags;
 
   unsigned FirstCommutableVecOp = 1;
@@ -2479,8 +2819,8 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
 
     // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
     // to return those values.
-    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
-                              CommutableOpIdx1, CommutableOpIdx2))
+    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
+                              CommutableOpIdx2))
       return false;
   }
 
@@ -2568,24 +2908,36 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
     if (Subtarget.hasSSE2())
       return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
     return false;
-  case X86::VPTERNLOGDZrri:      case X86::VPTERNLOGDZrmi:
-  case X86::VPTERNLOGDZ128rri:   case X86::VPTERNLOGDZ128rmi:
-  case X86::VPTERNLOGDZ256rri:   case X86::VPTERNLOGDZ256rmi:
-  case X86::VPTERNLOGQZrri:      case X86::VPTERNLOGQZrmi:
-  case X86::VPTERNLOGQZ128rri:   case X86::VPTERNLOGQZ128rmi:
-  case X86::VPTERNLOGQZ256rri:   case X86::VPTERNLOGQZ256rmi:
+  case X86::VPTERNLOGDZrri:
+  case X86::VPTERNLOGDZrmi:
+  case X86::VPTERNLOGDZ128rri:
+  case X86::VPTERNLOGDZ128rmi:
+  case X86::VPTERNLOGDZ256rri:
+  case X86::VPTERNLOGDZ256rmi:
+  case X86::VPTERNLOGQZrri:
+  case X86::VPTERNLOGQZrmi:
+  case X86::VPTERNLOGQZ128rri:
+  case X86::VPTERNLOGQZ128rmi:
+  case X86::VPTERNLOGQZ256rri:
+  case X86::VPTERNLOGQZ256rmi:
   case X86::VPTERNLOGDZrrik:
   case X86::VPTERNLOGDZ128rrik:
   case X86::VPTERNLOGDZ256rrik:
   case X86::VPTERNLOGQZrrik:
   case X86::VPTERNLOGQZ128rrik:
   case X86::VPTERNLOGQZ256rrik:
-  case X86::VPTERNLOGDZrrikz:    case X86::VPTERNLOGDZrmikz:
-  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
-  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
-  case X86::VPTERNLOGQZrrikz:    case X86::VPTERNLOGQZrmikz:
-  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
-  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+  case X86::VPTERNLOGDZrrikz:
+  case X86::VPTERNLOGDZrmikz:
+  case X86::VPTERNLOGDZ128rrikz:
+  case X86::VPTERNLOGDZ128rmikz:
+  case X86::VPTERNLOGDZ256rrikz:
+  case X86::VPTERNLOGDZ256rmikz:
+  case X86::VPTERNLOGQZrrikz:
+  case X86::VPTERNLOGQZrmikz:
+  case X86::VPTERNLOGQZ128rrikz:
+  case X86::VPTERNLOGQZ128rmikz:
+  case X86::VPTERNLOGQZ256rrikz:
+  case X86::VPTERNLOGQZ256rmikz:
   case X86::VPTERNLOGDZ128rmbi:
   case X86::VPTERNLOGDZ256rmbi:
   case X86::VPTERNLOGDZrmbi:
@@ -2674,19 +3026,18 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
       ++CommutableOpIdx1;
       ++CommutableOpIdx2;
     }
-    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
-                              CommutableOpIdx1, CommutableOpIdx2))
+    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
+                              CommutableOpIdx2))
       return false;
-    if (!MI.getOperand(SrcOpIdx1).isReg() ||
-        !MI.getOperand(SrcOpIdx2).isReg())
+    if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
       // No idea.
       return false;
     return true;
   }
 
   default:
-    const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
-                                                      MI.getDesc().TSFlags);
+    const X86InstrFMA3Group *FMA3Group =
+        getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
     if (FMA3Group)
       return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
                                            FMA3Group->isIntrinsic());
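
(Illustrative aside, not part of this patch: the FMA3 commute helper exists because swapping a multiplicand with the addend changes each register's role, so the instruction has to move to a different opcode form rather than simply renumbering operands. A standalone sketch under the usual 132/213/231 naming convention; the function names are ad hoc:)

  // fma_form_sketch.cpp -- why commuting FMA operands can change the opcode.
  #include <cassert>

  // VFMADD213-style roles: dst = src2 * dst + src3.
  double fma213(double dst, double src2, double src3) { return src2 * dst + src3; }
  // VFMADD231-style roles: dst = src2 * src3 + dst.
  double fma231(double dst, double src2, double src3) { return src2 * src3 + dst; }

  int main() {
    double a = 2.0, b = 3.0, c = 5.0;
    // Exchanging the addend with the first multiplicand preserves the value
    // only if the 213 form is rewritten as the 231 form.
    assert(fma213(a, b, c) == fma231(c, b, a));
    return 0;
  }
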
@@ -2714,8 +3065,8 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
         }
       }
 
-      if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
-                                CommutableOpIdx1, CommutableOpIdx2))
+      if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
+                                CommutableOpIdx2))
         return false;
 
       if (!MI.getOperand(SrcOpIdx1).isReg() ||
@@ -2819,25 +3170,44 @@ X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
 /// e.g. turning COND_E to COND_NE.
 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
   switch (CC) {
-  default: llvm_unreachable("Illegal condition code!");
-  case X86::COND_E:  return X86::COND_NE;
-  case X86::COND_NE: return X86::COND_E;
-  case X86::COND_L:  return X86::COND_GE;
-  case X86::COND_LE: return X86::COND_G;
-  case X86::COND_G:  return X86::COND_LE;
-  case X86::COND_GE: return X86::COND_L;
-  case X86::COND_B:  return X86::COND_AE;
-  case X86::COND_BE: return X86::COND_A;
-  case X86::COND_A:  return X86::COND_BE;
-  case X86::COND_AE: return X86::COND_B;
-  case X86::COND_S:  return X86::COND_NS;
-  case X86::COND_NS: return X86::COND_S;
-  case X86::COND_P:  return X86::COND_NP;
-  case X86::COND_NP: return X86::COND_P;
-  case X86::COND_O:  return X86::COND_NO;
-  case X86::COND_NO: return X86::COND_O;
-  case X86::COND_NE_OR_P:  return X86::COND_E_AND_NP;
-  case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
+  default:
+    llvm_unreachable("Illegal condition code!");
+  case X86::COND_E:
+    return X86::COND_NE;
+  case X86::COND_NE:
+    return X86::COND_E;
+  case X86::COND_L:
+    return X86::COND_GE;
+  case X86::COND_LE:
+    return X86::COND_G;
+  case X86::COND_G:
+    return X86::COND_LE;
+  case X86::COND_GE:
+    return X86::COND_L;
+  case X86::COND_B:
+    return X86::COND_AE;
+  case X86::COND_BE:
+    return X86::COND_A;
+  case X86::COND_A:
+    return X86::COND_BE;
+  case X86::COND_AE:
+    return X86::COND_B;
+  case X86::COND_S:
+    return X86::COND_NS;
+  case X86::COND_NS:
+    return X86::COND_S;
+  case X86::COND_P:
+    return X86::COND_NP;
+  case X86::COND_NP:
+    return X86::COND_P;
+  case X86::COND_O:
+    return X86::COND_NO;
+  case X86::COND_NO:
+    return X86::COND_O;
+  case X86::COND_NE_OR_P:
+    return X86::COND_E_AND_NP;
+  case X86::COND_E_AND_NP:
+    return X86::COND_NE_OR_P;
   }
 }
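
(Illustrative aside, not part of this patch: the "opposite" condition is the logical negation of the predicate with the operand order unchanged, e.g. B (unsigned below) negates to AE (unsigned above-or-equal). A standalone check of that reading:)

  // opposite_cond_sketch.cpp -- negating a condition on unchanged operands.
  #include <cassert>

  int main() {
    for (unsigned a = 0; a < 4; ++a)
      for (unsigned b = 0; b < 4; ++b) {
        assert((a < b) == !(a >= b));  // COND_B negates to COND_AE
        assert((a == b) == !(a != b)); // COND_E negates to COND_NE
      }
    return 0;
  }
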
 
@@ -2845,17 +3215,28 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
 /// modify the instructions such that flags are set by MI(b,a).
 static X86::CondCode getSwappedCondition(X86::CondCode CC) {
   switch (CC) {
-  default: return X86::COND_INVALID;
-  case X86::COND_E:  return X86::COND_E;
-  case X86::COND_NE: return X86::COND_NE;
-  case X86::COND_L:  return X86::COND_G;
-  case X86::COND_LE: return X86::COND_GE;
-  case X86::COND_G:  return X86::COND_L;
-  case X86::COND_GE: return X86::COND_LE;
-  case X86::COND_B:  return X86::COND_A;
-  case X86::COND_BE: return X86::COND_AE;
-  case X86::COND_A:  return X86::COND_B;
-  case X86::COND_AE: return X86::COND_BE;
+  default:
+    return X86::COND_INVALID;
+  case X86::COND_E:
+    return X86::COND_E;
+  case X86::COND_NE:
+    return X86::COND_NE;
+  case X86::COND_L:
+    return X86::COND_G;
+  case X86::COND_LE:
+    return X86::COND_GE;
+  case X86::COND_G:
+    return X86::COND_L;
+  case X86::COND_GE:
+    return X86::COND_LE;
+  case X86::COND_B:
+    return X86::COND_A;
+  case X86::COND_BE:
+    return X86::COND_AE;
+  case X86::COND_A:
+    return X86::COND_B;
+  case X86::COND_AE:
+    return X86::COND_BE;
   }
 }
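
(Illustrative aside, not part of this patch: getSwappedCondition answers a different question than GetOppositeBranchCondition. Here the operand order of the flag-producing compare changes, so L pairs with G and B pairs with A, while E and NE are symmetric. A standalone check:)

  // swapped_cond_sketch.cpp -- condition to use after swapping cmp operands.
  #include <cassert>

  int main() {
    for (int a = -2; a <= 2; ++a)
      for (int b = -2; b <= 2; ++b) {
        assert((a < b) == (b > a));   // COND_L on (a,b) == COND_G on (b,a)
        assert((a <= b) == (b >= a)); // COND_LE           COND_GE
        assert((a == b) == (b == a)); // COND_E is symmetric
      }
    return 0;
  }
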
 
@@ -2864,34 +3245,82 @@ X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
   X86::CondCode CC = X86::COND_INVALID;
   bool NeedSwap = false;
   switch (Predicate) {
-  default: break;
+  default:
+    break;
   // Floating-point Predicates
-  case CmpInst::FCMP_UEQ: CC = X86::COND_E;       break;
-  case CmpInst::FCMP_OLT: NeedSwap = true;        [[fallthrough]];
-  case CmpInst::FCMP_OGT: CC = X86::COND_A;       break;
-  case CmpInst::FCMP_OLE: NeedSwap = true;        [[fallthrough]];
-  case CmpInst::FCMP_OGE: CC = X86::COND_AE;      break;
-  case CmpInst::FCMP_UGT: NeedSwap = true;        [[fallthrough]];
-  case CmpInst::FCMP_ULT: CC = X86::COND_B;       break;
-  case CmpInst::FCMP_UGE: NeedSwap = true;        [[fallthrough]];
-  case CmpInst::FCMP_ULE: CC = X86::COND_BE;      break;
-  case CmpInst::FCMP_ONE: CC = X86::COND_NE;      break;
-  case CmpInst::FCMP_UNO: CC = X86::COND_P;       break;
-  case CmpInst::FCMP_ORD: CC = X86::COND_NP;      break;
-  case CmpInst::FCMP_OEQ:                         [[fallthrough]];
-  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+  case CmpInst::FCMP_UEQ:
+    CC = X86::COND_E;
+    break;
+  case CmpInst::FCMP_OLT:
+    NeedSwap = true;
+    [[fallthrough]];
+  case CmpInst::FCMP_OGT:
+    CC = X86::COND_A;
+    break;
+  case CmpInst::FCMP_OLE:
+    NeedSwap = true;
+    [[fallthrough]];
+  case CmpInst::FCMP_OGE:
+    CC = X86::COND_AE;
+    break;
+  case CmpInst::FCMP_UGT:
+    NeedSwap = true;
+    [[fallthrough]];
+  case CmpInst::FCMP_ULT:
+    CC = X86::COND_B;
+    break;
+  case CmpInst::FCMP_UGE:
+    NeedSwap = true;
+    [[fallthrough]];
+  case CmpInst::FCMP_ULE:
+    CC = X86::COND_BE;
+    break;
+  case CmpInst::FCMP_ONE:
+    CC = X86::COND_NE;
+    break;
+  case CmpInst::FCMP_UNO:
+    CC = X86::COND_P;
+    break;
+  case CmpInst::FCMP_ORD:
+    CC = X86::COND_NP;
+    break;
+  case CmpInst::FCMP_OEQ:
+    [[fallthrough]];
+  case CmpInst::FCMP_UNE:
+    CC = X86::COND_INVALID;
+    break;
 
   // Integer Predicates
-  case CmpInst::ICMP_EQ:  CC = X86::COND_E;       break;
-  case CmpInst::ICMP_NE:  CC = X86::COND_NE;      break;
-  case CmpInst::ICMP_UGT: CC = X86::COND_A;       break;
-  case CmpInst::ICMP_UGE: CC = X86::COND_AE;      break;
-  case CmpInst::ICMP_ULT: CC = X86::COND_B;       break;
-  case CmpInst::ICMP_ULE: CC = X86::COND_BE;      break;
-  case CmpInst::ICMP_SGT: CC = X86::COND_G;       break;
-  case CmpInst::ICMP_SGE: CC = X86::COND_GE;      break;
-  case CmpInst::ICMP_SLT: CC = X86::COND_L;       break;
-  case CmpInst::ICMP_SLE: CC = X86::COND_LE;      break;
+  case CmpInst::ICMP_EQ:
+    CC = X86::COND_E;
+    break;
+  case CmpInst::ICMP_NE:
+    CC = X86::COND_NE;
+    break;
+  case CmpInst::ICMP_UGT:
+    CC = X86::COND_A;
+    break;
+  case CmpInst::ICMP_UGE:
+    CC = X86::COND_AE;
+    break;
+  case CmpInst::ICMP_ULT:
+    CC = X86::COND_B;
+    break;
+  case CmpInst::ICMP_ULE:
+    CC = X86::COND_BE;
+    break;
+  case CmpInst::ICMP_SGT:
+    CC = X86::COND_G;
+    break;
+  case CmpInst::ICMP_SGE:
+    CC = X86::COND_GE;
+    break;
+  case CmpInst::ICMP_SLT:
+    CC = X86::COND_L;
+    break;
+  case CmpInst::ICMP_SLE:
+    CC = X86::COND_LE;
+    break;
   }
 
   return std::make_pair(CC, NeedSwap);
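
(Illustrative aside, not part of this patch: the NeedSwap flag exists because the ordered float predicates are implemented with the "above"-style conditions produced by the unordered compare instructions, and OLT/OLE (likewise UGT/UGE) are obtained by comparing the operands in the other order. The identity relied on, including the unordered case, looks like this in a standalone sketch:)

  // fcmp_swap_sketch.cpp -- the operand swap behind FCMP_OLT -> COND_A.
  #include <cassert>
  #include <cmath>

  int main() {
    double a = 1.5, b = 2.5;
    // OLT on (a, b) is evaluated as "above" on the swapped pair (b, a).
    assert((a < b) == (b > a));
    // Unordered inputs make both sides false, matching the O* predicates.
    double n = std::nan("");
    assert(!(n < b) && !(b > n));
    return 0;
  }
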
@@ -2899,39 +3328,59 @@ X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
 
 /// Return a cmov opcode for the given register size in bytes, and operand type.
 unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
-  switch(RegBytes) {
-  default: llvm_unreachable("Illegal register size!");
-  case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
-  case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
-  case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
+  switch (RegBytes) {
+  default:
+    llvm_unreachable("Illegal register size!");
+  case 2:
+    return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
+  case 4:
+    return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
+  case 8:
+    return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
   }
 }
 
 /// Get the VPCMP immediate for the given condition.
 unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
   switch (CC) {
-  default: llvm_unreachable("Unexpected SETCC condition");
-  case ISD::SETNE:  return 4;
-  case ISD::SETEQ:  return 0;
+  default:
+    llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:
+    return 4;
+  case ISD::SETEQ:
+    return 0;
   case ISD::SETULT:
-  case ISD::SETLT: return 1;
+  case ISD::SETLT:
+    return 1;
   case ISD::SETUGT:
-  case ISD::SETGT: return 6;
+  case ISD::SETGT:
+    return 6;
   case ISD::SETUGE:
-  case ISD::SETGE: return 5;
+  case ISD::SETGE:
+    return 5;
   case ISD::SETULE:
-  case ISD::SETLE: return 2;
+  case ISD::SETLE:
+    return 2;
   }
 }
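
(Illustrative aside, not part of this patch: the AVX-512 VPCMP immediate encodes only the relation, with 0=EQ, 1=LT, 2=LE, 4=NE, 5=not-LT, 6=not-LE assumed here; signedness is carried by the opcode, which is why SETLT and SETULT share an immediate above. Swapping the operands then pairs LT with not-LE and LE with not-LT, as the next helper does. A standalone check of those pairings:)

  // vpcmp_imm_sketch.cpp -- relations behind the VPCMP immediate tables.
  #include <cassert>

  int main() {
    for (int a = -2; a <= 2; ++a)
      for (int b = -2; b <= 2; ++b) {
        assert((a < b) == !(b <= a));  // imm 1 (LT) on (a,b) == imm 6 (NLE) on (b,a)
        assert((a <= b) == !(b < a));  // imm 2 (LE) on (a,b) == imm 5 (NLT) on (b,a)
      }
    return 0;
  }
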
 
 /// Get the VPCMP immediate if the operands are swapped.
 unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
   switch (Imm) {
-  default: llvm_unreachable("Unreachable!");
-  case 0x01: Imm = 0x06; break; // LT  -> NLE
-  case 0x02: Imm = 0x05; break; // LE  -> NLT
-  case 0x05: Imm = 0x02; break; // NLT -> LE
-  case 0x06: Imm = 0x01; break; // NLE -> LT
+  default:
+    llvm_unreachable("Unreachable!");
+  case 0x01:
+    Imm = 0x06;
+    break; // LT  -> NLE
+  case 0x02:
+    Imm = 0x05;
+    break; // LE  -> NLT
+  case 0x05:
+    Imm = 0x02;
+    break; // NLT -> LE
+  case 0x06:
+    Imm = 0x01;
+    break;   // NLE -> LT
   case 0x00: // EQ
   case 0x03: // FALSE
   case 0x04: // NE
@@ -2945,11 +3394,20 @@ unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
 /// Get the VPCOM immediate if the operands are swapped.
 unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
   switch (Imm) {
-  default: llvm_unreachable("Unreachable!");
-  case 0x00: Imm = 0x02; break; // LT -> GT
-  case 0x01: Imm = 0x03; break; // LE -> GE
-  case 0x02: Imm = 0x00; break; // GT -> LT
-  case 0x03: Imm = 0x01; break; // GE -> LE
+  default:
+    llvm_unreachable("Unreachable!");
+  case 0x00:
+    Imm = 0x02;
+    break; // LT -> GT
+  case 0x01:
+    Imm = 0x03;
+    break; // LE -> GE
+  case 0x02:
+    Imm = 0x00;
+    break; // GT -> LT
+  case 0x03:
+    Imm = 0x01;
+    break;   // GE -> LE
   case 0x04: // EQ
   case 0x05: // NE
   case 0x06: // FALSE
@@ -2964,11 +3422,14 @@ unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
 unsigned X86::getSwappedVCMPImm(unsigned Imm) {
   // Only need the lower 2 bits to distinguish.
   switch (Imm & 0x3) {
-  default: llvm_unreachable("Unreachable!");
-  case 0x00: case 0x03:
+  default:
+    llvm_unreachable("Unreachable!");
+  case 0x00:
+  case 0x03:
     // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
     break;
-  case 0x01: case 0x02:
+  case 0x01:
+  case 0x02:
     // Need to toggle bits 3:0. Bit 4 stays the same.
     Imm ^= 0xf;
     break;
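
(Illustrative aside, not part of this patch: a sketch of the bit trick in this hunk, assuming the AVX VCMP predicate layout where the low bits select the relation and bit 4 the signalling/quiet range. For the LT/LE/NLT/NLE group, XOR-ing the low four bits maps each predicate to its operand-swapped counterpart; EQ/NE/ORD/UNORD/TRUE/FALSE are symmetric and need no change:)

  // vcmp_swap_sketch.cpp -- the "xor 0xf" predicate mirror, by example.
  #include <cassert>

  int main() {
    // Presumed AVX predicate encodings: 0x01 LT, 0x02 LE, 0x0d GE, 0x0e GT.
    assert((0x01 ^ 0xf) == 0x0e); // LT on (a,b) becomes GT on (b,a)
    assert((0x02 ^ 0xf) == 0x0d); // LE on (a,b) becomes GE on (b,a)
    // The relations themselves really are mirror images:
    double a = 1.0, b = 2.0;
    assert((a < b) == (b > a) && (a <= b) == (b >= a));
    return 0;
  }
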
@@ -3078,9 +3539,9 @@ void X86InstrInfo::replaceBranchWithTailCall(
 
   auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
   MIB->addOperand(TailCall.getOperand(0)); // Destination.
-  MIB.addImm(0); // Stack offset (not used).
-  MIB->addOperand(BranchCond[0]); // Condition.
-  MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
+  MIB.addImm(0);                           // Stack offset (not used).
+  MIB->addOperand(BranchCond[0]);          // Condition.
+  MIB.copyImplicitOps(TailCall);           // Regmask and (imp-used) parameters.
 
   // Add implicit uses and defs of all live regs potentially clobbered by the
   // call. This way they still appear live across the call.
@@ -3173,7 +3634,7 @@ bool X86InstrInfo::AnalyzeBranchImpl(
     // Handle conditional branches.
     X86::CondCode BranchCode = X86::getCondFromBranch(*I);
     if (BranchCode == X86::COND_INVALID)
-      return true;  // Can't handle indirect branch.
+      return true; // Can't handle indirect branch.
 
     // In practice we should never have an undef eflags operand, if we do
     // abort here as we are not prepared to preserve the flag.
@@ -3205,8 +3666,8 @@ bool X86InstrInfo::AnalyzeBranchImpl(
     // we could handle more patterns here, but we shouldn't expect to see them
     // if instruction selection has done a reasonable job.
     if (TBB == NewTBB &&
-               ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
-                (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
+        ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
+         (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
       BranchCode = X86::COND_NE_OR_P;
     } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
                (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
@@ -3408,8 +3869,7 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *TBB,
                                     MachineBasicBlock *FBB,
                                     ArrayRef<MachineOperand> Cond,
-                                    const DebugLoc &DL,
-                                    int *BytesAdded) const {
+                                    const DebugLoc &DL, int *BytesAdded) const {
   // Shouldn't be a fall through.
   assert(TBB && "insertBranch must not be told to insert a fallthrough");
   assert((Cond.size() == 1 || Cond.size() == 0) &&
@@ -3480,7 +3940,7 @@ bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
   // Check register classes.
   const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   const TargetRegisterClass *RC =
-    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
   if (!RC)
     return false;
 
@@ -3532,7 +3992,8 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
   // SrcReg(MaskReg) -> DestReg(GR64)
   // SrcReg(MaskReg) -> DestReg(GR32)
 
-  // All KMASK RegClasses hold the same k registers, can be tested against anyone.
+  // All KMASK RegClasses hold the same k registers, can be tested against
+  // anyone.
   if (X86::VK16RegClass.contains(SrcReg)) {
     if (X86::GR64RegClass.contains(DestReg)) {
       assert(Subtarget.hasBWI());
@@ -3546,7 +4007,8 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
   // SrcReg(GR64) -> DestReg(MaskReg)
   // SrcReg(GR32) -> DestReg(MaskReg)
 
-  // All KMASK RegClasses hold the same k registers, can be tested against anyone.
+  // All KMASK RegClasses hold the same k registers, can be tested against
+  // anyone.
   if (X86::VK16RegClass.contains(DestReg)) {
     if (X86::GR64RegClass.contains(SrcReg)) {
       assert(Subtarget.hasBWI());
@@ -3557,7 +4019,6 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
                                 : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
   }
 
-
   // SrcReg(VR128) -> DestReg(GR64)
   // SrcReg(VR64)  -> DestReg(GR64)
   // SrcReg(GR64)  -> DestReg(VR128)
@@ -3566,18 +4027,18 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
   if (X86::GR64RegClass.contains(DestReg)) {
     if (X86::VR128XRegClass.contains(SrcReg))
       // Copy from a VR128 register to a GR64 register.
-      return HasAVX512 ? X86::VMOVPQIto64Zrr :
-             HasAVX    ? X86::VMOVPQIto64rr  :
-                         X86::MOVPQIto64rr;
+      return HasAVX512 ? X86::VMOVPQIto64Zrr
+             : HasAVX  ? X86::VMOVPQIto64rr
+                       : X86::MOVPQIto64rr;
     if (X86::VR64RegClass.contains(SrcReg))
       // Copy from a VR64 register to a GR64 register.
       return X86::MMX_MOVD64from64rr;
   } else if (X86::GR64RegClass.contains(SrcReg)) {
     // Copy from a GR64 register to a VR128 register.
     if (X86::VR128XRegClass.contains(DestReg))
-      return HasAVX512 ? X86::VMOV64toPQIZrr :
-             HasAVX    ? X86::VMOV64toPQIrr  :
-                         X86::MOV64toPQIrr;
+      return HasAVX512 ? X86::VMOV64toPQIZrr
+             : HasAVX  ? X86::VMOV64toPQIrr
+                       : X86::MOV64toPQIrr;
     // Copy from a GR64 register to a VR64 register.
     if (X86::VR64RegClass.contains(DestReg))
       return X86::MMX_MOVD64to64rr;
@@ -3589,16 +4050,16 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
   if (X86::GR32RegClass.contains(DestReg) &&
       X86::VR128XRegClass.contains(SrcReg))
     // Copy from a VR128 register to a GR32 register.
-    return HasAVX512 ? X86::VMOVPDI2DIZrr :
-           HasAVX    ? X86::VMOVPDI2DIrr  :
-                       X86::MOVPDI2DIrr;
+    return HasAVX512 ? X86::VMOVPDI2DIZrr
+           : HasAVX  ? X86::VMOVPDI2DIrr
+                     : X86::MOVPDI2DIrr;
 
   if (X86::VR128XRegClass.contains(DestReg) &&
       X86::GR32RegClass.contains(SrcReg))
     // Copy from a GR32 register to a VR128 register.
-    return HasAVX512 ? X86::VMOVDI2PDIZrr :
-           HasAVX    ? X86::VMOVDI2PDIrr  :
-                       X86::MOVDI2PDIrr;
+    return HasAVX512 ? X86::VMOVDI2PDIZrr
+           : HasAVX  ? X86::VMOVDI2PDIrr
+                     : X86::MOVDI2PDIrr;
   return 0;
 }
 
@@ -3619,16 +4080,14 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
     // Copying to or from a physical H register on x86-64 requires a NOREX
     // move.  Otherwise use a normal move.
-    if ((isHReg(DestReg) || isHReg(SrcReg)) &&
-        Subtarget.is64Bit()) {
+    if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
       Opc = X86::MOV8rr_NOREX;
       // Both operands must be encodable without an REX prefix.
       assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
              "8-bit H register can not be copied outside GR8_NOREX");
     } else
       Opc = X86::MOV8rr;
-  }
-  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+  } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MMX_MOVQ64rr;
   else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
     if (HasVLX)
@@ -3640,10 +4099,10 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       // 512-bit move.
       Opc = X86::VMOVAPSZrr;
       const TargetRegisterInfo *TRI = &getRegisterInfo();
-      DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
-                                         &X86::VR512RegClass);
-      SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
-                                        &X86::VR512RegClass);
+      DestReg =
+          TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass);
+      SrcReg =
+          TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
     }
   } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
     if (HasVLX)
@@ -3655,14 +4114,15 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       // 512-bit move.
       Opc = X86::VMOVAPSZrr;
       const TargetRegisterInfo *TRI = &getRegisterInfo();
-      DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
-                                         &X86::VR512RegClass);
-      SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
-                                        &X86::VR512RegClass);
+      DestReg =
+          TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass);
+      SrcReg =
+          TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
     }
   } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
     Opc = X86::VMOVAPSZrr;
-  // All KMASK RegClasses hold the same k registers, can be tested against anyone.
+  // All KMASK RegClasses hold the same k registers, can be tested against
+  // anyone.
   else if (X86::VK16RegClass.contains(DestReg, SrcReg))
     Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
   if (!Opc)
@@ -3670,7 +4130,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   if (Opc) {
     BuildMI(MBB, MI, DL, get(Opc), DestReg)
-      .addReg(SrcReg, getKillRegState(KillSrc));
+        .addReg(SrcReg, getKillRegState(KillSrc));
     return;
   }
 
@@ -3745,13 +4205,12 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
     if (X86::GR32RegClass.hasSubClassEq(RC))
       return Load ? X86::MOV32rm : X86::MOV32mr;
     if (X86::FR32XRegClass.hasSubClassEq(RC))
-      return Load ?
-        (HasAVX512 ? X86::VMOVSSZrm_alt :
-         HasAVX    ? X86::VMOVSSrm_alt :
-                     X86::MOVSSrm_alt) :
-        (HasAVX512 ? X86::VMOVSSZmr :
-         HasAVX    ? X86::VMOVSSmr :
-                     X86::MOVSSmr);
+      return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt
+                     : HasAVX  ? X86::VMOVSSrm_alt
+                               : X86::MOVSSrm_alt)
+                  : (HasAVX512 ? X86::VMOVSSZmr
+                     : HasAVX  ? X86::VMOVSSmr
+                               : X86::MOVSSmr);
     if (X86::RFP32RegClass.hasSubClassEq(RC))
       return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
     if (X86::VK32RegClass.hasSubClassEq(RC)) {
@@ -3775,13 +4234,12 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
     if (X86::GR64RegClass.hasSubClassEq(RC))
       return Load ? X86::MOV64rm : X86::MOV64mr;
     if (X86::FR64XRegClass.hasSubClassEq(RC))
-      return Load ?
-        (HasAVX512 ? X86::VMOVSDZrm_alt :
-         HasAVX    ? X86::VMOVSDrm_alt :
-                     X86::MOVSDrm_alt) :
-        (HasAVX512 ? X86::VMOVSDZmr :
-         HasAVX    ? X86::VMOVSDmr :
-                     X86::MOVSDmr);
+      return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt
+                     : HasAVX  ? X86::VMOVSDrm_alt
+                               : X86::MOVSDrm_alt)
+                  : (HasAVX512 ? X86::VMOVSDZmr
+                     : HasAVX  ? X86::VMOVSDmr
+                               : X86::MOVSDmr);
     if (X86::VR64RegClass.hasSubClassEq(RC))
       return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
     if (X86::RFP64RegClass.hasSubClassEq(RC))
@@ -3799,25 +4257,23 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
     if (X86::VR128XRegClass.hasSubClassEq(RC)) {
       // If stack is realigned we can use aligned stores.
       if (IsStackAligned)
-        return Load ?
-          (HasVLX    ? X86::VMOVAPSZ128rm :
-           HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
-           HasAVX    ? X86::VMOVAPSrm :
-                       X86::MOVAPSrm):
-          (HasVLX    ? X86::VMOVAPSZ128mr :
-           HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
-           HasAVX    ? X86::VMOVAPSmr :
-                       X86::MOVAPSmr);
+        return Load ? (HasVLX      ? X86::VMOVAPSZ128rm
+                       : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX
+                       : HasAVX    ? X86::VMOVAPSrm
+                                   : X86::MOVAPSrm)
+                    : (HasVLX      ? X86::VMOVAPSZ128mr
+                       : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX
+                       : HasAVX    ? X86::VMOVAPSmr
+                                   : X86::MOVAPSmr);
       else
-        return Load ?
-          (HasVLX    ? X86::VMOVUPSZ128rm :
-           HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
-           HasAVX    ? X86::VMOVUPSrm :
-                       X86::MOVUPSrm):
-          (HasVLX    ? X86::VMOVUPSZ128mr :
-           HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
-           HasAVX    ? X86::VMOVUPSmr :
-                       X86::MOVUPSmr);
+        return Load ? (HasVLX      ? X86::VMOVUPSZ128rm
+                       : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX
+                       : HasAVX    ? X86::VMOVUPSrm
+                                   : X86::MOVUPSrm)
+                    : (HasVLX      ? X86::VMOVUPSZ128mr
+                       : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
+                       : HasAVX    ? X86::VMOVUPSmr
+                                   : X86::MOVUPSmr);
     }
     llvm_unreachable("Unknown 16-byte regclass");
   }
@@ -3825,21 +4281,19 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
     assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (IsStackAligned)
-      return Load ?
-        (HasVLX    ? X86::VMOVAPSZ256rm :
-         HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
-                     X86::VMOVAPSYrm) :
-        (HasVLX    ? X86::VMOVAPSZ256mr :
-         HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
-                     X86::VMOVAPSYmr);
+      return Load ? (HasVLX      ? X86::VMOVAPSZ256rm
+                     : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
+                                 : X86::VMOVAPSYrm)
+                  : (HasVLX      ? X86::VMOVAPSZ256mr
+                     : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
+                                 : X86::VMOVAPSYmr);
     else
-      return Load ?
-        (HasVLX    ? X86::VMOVUPSZ256rm :
-         HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
-                     X86::VMOVUPSYrm) :
-        (HasVLX    ? X86::VMOVUPSZ256mr :
-         HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
-                     X86::VMOVUPSYmr);
+      return Load ? (HasVLX      ? X86::VMOVUPSZ256rm
+                     : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
+                                 : X86::VMOVUPSYrm)
+                  : (HasVLX      ? X86::VMOVUPSZ256mr
+                     : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
+                                 : X86::VMOVUPSYmr);
   case 64:
     assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
     assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
@@ -4131,7 +4585,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                   Register &SrcReg2, int64_t &CmpMask,
                                   int64_t &CmpValue) const {
   switch (MI.getOpcode()) {
-  default: break;
+  default:
+    break;
   case X86::CMP64ri32:
   case X86::CMP32ri:
   case X86::CMP16ri:
@@ -4294,104 +4749,225 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
   }
 
   switch (MI.getOpcode()) {
-  default: return false;
+  default:
+    return false;
 
   // The shift instructions only modify ZF if their shift count is non-zero.
   // N.B.: The processor truncates the shift count depending on the encoding.
-  case X86::SAR8ri:    case X86::SAR16ri:  case X86::SAR32ri:case X86::SAR64ri:
-  case X86::SHR8ri:    case X86::SHR16ri:  case X86::SHR32ri:case X86::SHR64ri:
-     return getTruncatedShiftCount(MI, 2) != 0;
+  case X86::SAR8ri:
+  case X86::SAR16ri:
+  case X86::SAR32ri:
+  case X86::SAR64ri:
+  case X86::SHR8ri:
+  case X86::SHR16ri:
+  case X86::SHR32ri:
+  case X86::SHR64ri:
+    return getTruncatedShiftCount(MI, 2) != 0;
 
   // Some left shift instructions can be turned into LEA instructions but only
   // if their flags aren't used. Avoid transforming such instructions.
-  case X86::SHL8ri:    case X86::SHL16ri:  case X86::SHL32ri:case X86::SHL64ri:{
+  case X86::SHL8ri:
+  case X86::SHL16ri:
+  case X86::SHL32ri:
+  case X86::SHL64ri: {
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (isTruncatedShiftCountForLEA(ShAmt)) return false;
+    if (isTruncatedShiftCountForLEA(ShAmt))
+      return false;
     return ShAmt != 0;
   }
 
-  case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
-  case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
-     return getTruncatedShiftCount(MI, 3) != 0;
-
-  case X86::SUB64ri32: case X86::SUB32ri:  case X86::SUB16ri:
-  case X86::SUB8ri:    case X86::SUB64rr:  case X86::SUB32rr:
-  case X86::SUB16rr:   case X86::SUB8rr:   case X86::SUB64rm:
-  case X86::SUB32rm:   case X86::SUB16rm:  case X86::SUB8rm:
-  case X86::DEC64r:    case X86::DEC32r:   case X86::DEC16r: case X86::DEC8r:
-  case X86::ADD64ri32: case X86::ADD32ri:  case X86::ADD16ri:
-  case X86::ADD8ri:    case X86::ADD64rr:  case X86::ADD32rr:
-  case X86::ADD16rr:   case X86::ADD8rr:   case X86::ADD64rm:
-  case X86::ADD32rm:   case X86::ADD16rm:  case X86::ADD8rm:
-  case X86::INC64r:    case X86::INC32r:   case X86::INC16r: case X86::INC8r:
-  case X86::ADC64ri32: case X86::ADC32ri:  case X86::ADC16ri:
-  case X86::ADC8ri:    case X86::ADC64rr:  case X86::ADC32rr:
-  case X86::ADC16rr:   case X86::ADC8rr:   case X86::ADC64rm:
-  case X86::ADC32rm:   case X86::ADC16rm:  case X86::ADC8rm:
-  case X86::SBB64ri32: case X86::SBB32ri:  case X86::SBB16ri:
-  case X86::SBB8ri:    case X86::SBB64rr:  case X86::SBB32rr:
-  case X86::SBB16rr:   case X86::SBB8rr:   case X86::SBB64rm:
-  case X86::SBB32rm:   case X86::SBB16rm:  case X86::SBB8rm:
-  case X86::NEG8r:     case X86::NEG16r:   case X86::NEG32r: case X86::NEG64r:
-  case X86::LZCNT16rr: case X86::LZCNT16rm:
-  case X86::LZCNT32rr: case X86::LZCNT32rm:
-  case X86::LZCNT64rr: case X86::LZCNT64rm:
-  case X86::POPCNT16rr:case X86::POPCNT16rm:
-  case X86::POPCNT32rr:case X86::POPCNT32rm:
-  case X86::POPCNT64rr:case X86::POPCNT64rm:
-  case X86::TZCNT16rr: case X86::TZCNT16rm:
-  case X86::TZCNT32rr: case X86::TZCNT32rm:
-  case X86::TZCNT64rr: case X86::TZCNT64rm:
+  case X86::SHRD16rri8:
+  case X86::SHRD32rri8:
+  case X86::SHRD64rri8:
+  case X86::SHLD16rri8:
+  case X86::SHLD32rri8:
+  case X86::SHLD64rri8:
+    return getTruncatedShiftCount(MI, 3) != 0;
+
+  case X86::SUB64ri32:
+  case X86::SUB32ri:
+  case X86::SUB16ri:
+  case X86::SUB8ri:
+  case X86::SUB64rr:
+  case X86::SUB32rr:
+  case X86::SUB16rr:
+  case X86::SUB8rr:
+  case X86::SUB64rm:
+  case X86::SUB32rm:
+  case X86::SUB16rm:
+  case X86::SUB8rm:
+  case X86::DEC64r:
+  case X86::DEC32r:
+  case X86::DEC16r:
+  case X86::DEC8r:
+  case X86::ADD64ri32:
+  case X86::ADD32ri:
+  case X86::ADD16ri:
+  case X86::ADD8ri:
+  case X86::ADD64rr:
+  case X86::ADD32rr:
+  case X86::ADD16rr:
+  case X86::ADD8rr:
+  case X86::ADD64rm:
+  case X86::ADD32rm:
+  case X86::ADD16rm:
+  case X86::ADD8rm:
+  case X86::INC64r:
+  case X86::INC32r:
+  case X86::INC16r:
+  case X86::INC8r:
+  case X86::ADC64ri32:
+  case X86::ADC32ri:
+  case X86::ADC16ri:
+  case X86::ADC8ri:
+  case X86::ADC64rr:
+  case X86::ADC32rr:
+  case X86::ADC16rr:
+  case X86::ADC8rr:
+  case X86::ADC64rm:
+  case X86::ADC32rm:
+  case X86::ADC16rm:
+  case X86::ADC8rm:
+  case X86::SBB64ri32:
+  case X86::SBB32ri:
+  case X86::SBB16ri:
+  case X86::SBB8ri:
+  case X86::SBB64rr:
+  case X86::SBB32rr:
+  case X86::SBB16rr:
+  case X86::SBB8rr:
+  case X86::SBB64rm:
+  case X86::SBB32rm:
+  case X86::SBB16rm:
+  case X86::SBB8rm:
+  case X86::NEG8r:
+  case X86::NEG16r:
+  case X86::NEG32r:
+  case X86::NEG64r:
+  case X86::LZCNT16rr:
+  case X86::LZCNT16rm:
+  case X86::LZCNT32rr:
+  case X86::LZCNT32rm:
+  case X86::LZCNT64rr:
+  case X86::LZCNT64rm:
+  case X86::POPCNT16rr:
+  case X86::POPCNT16rm:
+  case X86::POPCNT32rr:
+  case X86::POPCNT32rm:
+  case X86::POPCNT64rr:
+  case X86::POPCNT64rm:
+  case X86::TZCNT16rr:
+  case X86::TZCNT16rm:
+  case X86::TZCNT32rr:
+  case X86::TZCNT32rm:
+  case X86::TZCNT64rr:
+  case X86::TZCNT64rm:
     return true;
-  case X86::AND64ri32:   case X86::AND32ri:   case X86::AND16ri:
-  case X86::AND8ri:      case X86::AND64rr:   case X86::AND32rr:
-  case X86::AND16rr:     case X86::AND8rr:    case X86::AND64rm:
-  case X86::AND32rm:     case X86::AND16rm:   case X86::AND8rm:
-  case X86::XOR64ri32:   case X86::XOR32ri:   case X86::XOR16ri:
-  case X86::XOR8ri:      case X86::XOR64rr:   case X86::XOR32rr:
-  case X86::XOR16rr:     case X86::XOR8rr:    case X86::XOR64rm:
-  case X86::XOR32rm:     case X86::XOR16rm:   case X86::XOR8rm:
-  case X86::OR64ri32:    case X86::OR32ri:    case X86::OR16ri:
-  case X86::OR8ri:       case X86::OR64rr:    case X86::OR32rr:
-  case X86::OR16rr:      case X86::OR8rr:     case X86::OR64rm:
-  case X86::OR32rm:      case X86::OR16rm:    case X86::OR8rm:
-  case X86::ANDN32rr:    case X86::ANDN32rm:
-  case X86::ANDN64rr:    case X86::ANDN64rm:
-  case X86::BLSI32rr:    case X86::BLSI32rm:
-  case X86::BLSI64rr:    case X86::BLSI64rm:
-  case X86::BLSMSK32rr:  case X86::BLSMSK32rm:
-  case X86::BLSMSK64rr:  case X86::BLSMSK64rm:
-  case X86::BLSR32rr:    case X86::BLSR32rm:
-  case X86::BLSR64rr:    case X86::BLSR64rm:
-  case X86::BLCFILL32rr: case X86::BLCFILL32rm:
-  case X86::BLCFILL64rr: case X86::BLCFILL64rm:
-  case X86::BLCI32rr:    case X86::BLCI32rm:
-  case X86::BLCI64rr:    case X86::BLCI64rm:
-  case X86::BLCIC32rr:   case X86::BLCIC32rm:
-  case X86::BLCIC64rr:   case X86::BLCIC64rm:
-  case X86::BLCMSK32rr:  case X86::BLCMSK32rm:
-  case X86::BLCMSK64rr:  case X86::BLCMSK64rm:
-  case X86::BLCS32rr:    case X86::BLCS32rm:
-  case X86::BLCS64rr:    case X86::BLCS64rm:
-  case X86::BLSFILL32rr: case X86::BLSFILL32rm:
-  case X86::BLSFILL64rr: case X86::BLSFILL64rm:
-  case X86::BLSIC32rr:   case X86::BLSIC32rm:
-  case X86::BLSIC64rr:   case X86::BLSIC64rm:
-  case X86::BZHI32rr:    case X86::BZHI32rm:
-  case X86::BZHI64rr:    case X86::BZHI64rm:
-  case X86::T1MSKC32rr:  case X86::T1MSKC32rm:
-  case X86::T1MSKC64rr:  case X86::T1MSKC64rm:
-  case X86::TZMSK32rr:   case X86::TZMSK32rm:
-  case X86::TZMSK64rr:   case X86::TZMSK64rm:
+  case X86::AND64ri32:
+  case X86::AND32ri:
+  case X86::AND16ri:
+  case X86::AND8ri:
+  case X86::AND64rr:
+  case X86::AND32rr:
+  case X86::AND16rr:
+  case X86::AND8rr:
+  case X86::AND64rm:
+  case X86::AND32rm:
+  case X86::AND16rm:
+  case X86::AND8rm:
+  case X86::XOR64ri32:
+  case X86::XOR32ri:
+  case X86::XOR16ri:
+  case X86::XOR8ri:
+  case X86::XOR64rr:
+  case X86::XOR32rr:
+  case X86::XOR16rr:
+  case X86::XOR8rr:
+  case X86::XOR64rm:
+  case X86::XOR32rm:
+  case X86::XOR16rm:
+  case X86::XOR8rm:
+  case X86::OR64ri32:
+  case X86::OR32ri:
+  case X86::OR16ri:
+  case X86::OR8ri:
+  case X86::OR64rr:
+  case X86::OR32rr:
+  case X86::OR16rr:
+  case X86::OR8rr:
+  case X86::OR64rm:
+  case X86::OR32rm:
+  case X86::OR16rm:
+  case X86::OR8rm:
+  case X86::ANDN32rr:
+  case X86::ANDN32rm:
+  case X86::ANDN64rr:
+  case X86::ANDN64rm:
+  case X86::BLSI32rr:
+  case X86::BLSI32rm:
+  case X86::BLSI64rr:
+  case X86::BLSI64rm:
+  case X86::BLSMSK32rr:
+  case X86::BLSMSK32rm:
+  case X86::BLSMSK64rr:
+  case X86::BLSMSK64rm:
+  case X86::BLSR32rr:
+  case X86::BLSR32rm:
+  case X86::BLSR64rr:
+  case X86::BLSR64rm:
+  case X86::BLCFILL32rr:
+  case X86::BLCFILL32rm:
+  case X86::BLCFILL64rr:
+  case X86::BLCFILL64rm:
+  case X86::BLCI32rr:
+  case X86::BLCI32rm:
+  case X86::BLCI64rr:
+  case X86::BLCI64rm:
+  case X86::BLCIC32rr:
+  case X86::BLCIC32rm:
+  case X86::BLCIC64rr:
+  case X86::BLCIC64rm:
+  case X86::BLCMSK32rr:
+  case X86::BLCMSK32rm:
+  case X86::BLCMSK64rr:
+  case X86::BLCMSK64rm:
+  case X86::BLCS32rr:
+  case X86::BLCS32rm:
+  case X86::BLCS64rr:
+  case X86::BLCS64rm:
+  case X86::BLSFILL32rr:
+  case X86::BLSFILL32rm:
+  case X86::BLSFILL64rr:
+  case X86::BLSFILL64rm:
+  case X86::BLSIC32rr:
+  case X86::BLSIC32rm:
+  case X86::BLSIC64rr:
+  case X86::BLSIC64rm:
+  case X86::BZHI32rr:
+  case X86::BZHI32rm:
+  case X86::BZHI64rr:
+  case X86::BZHI64rm:
+  case X86::T1MSKC32rr:
+  case X86::T1MSKC32rm:
+  case X86::T1MSKC64rr:
+  case X86::T1MSKC64rm:
+  case X86::TZMSK32rr:
+  case X86::TZMSK32rm:
+  case X86::TZMSK64rr:
+  case X86::TZMSK64rm:
     // These instructions clear the overflow flag just like TEST.
     // FIXME: These are not the only instructions in this switch that clear the
     // overflow flag.
     ClearsOverflowFlag = true;
     return true;
-  case X86::BEXTR32rr:   case X86::BEXTR64rr:
-  case X86::BEXTR32rm:   case X86::BEXTR64rm:
-  case X86::BEXTRI32ri:  case X86::BEXTRI32mi:
-  case X86::BEXTRI64ri:  case X86::BEXTRI64mi:
+  case X86::BEXTR32rr:
+  case X86::BEXTR64rr:
+  case X86::BEXTR32rm:
+  case X86::BEXTR64rm:
+  case X86::BEXTRI32ri:
+  case X86::BEXTRI32mi:
+  case X86::BEXTRI64ri:
+  case X86::BEXTRI64mi:
     // BEXTR doesn't update the sign flag so we can't use it. It does clear
     // the overflow flag, but that's not useful without the sign flag.
     NoSignFlag = true;
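
(Illustrative aside, not part of this patch: isDefConvertible is what lets a following compare-with-zero be deleted, because instructions such as SUB, AND or DEC already leave ZF and SF describing their result. The fact being exploited is simply that testing the result against zero re-derives those flags; a standalone sketch:)

  // flags_reuse_sketch.cpp -- why "op; cmp result,0" can drop the cmp.
  #include <cassert>
  #include <cstdint>

  struct Flags { bool ZF, SF; };
  Flags flagsOf(int32_t v) { return {v == 0, v < 0}; }

  int main() {
    for (int32_t a = -3; a <= 3; ++a)
      for (int32_t b = -3; b <= 3; ++b) {
        int32_t r = a & b;             // AND already sets ZF/SF from r
        Flags viaOp = flagsOf(r);
        Flags viaCmp = flagsOf(r - 0); // what a trailing "cmp r, 0" computes
        assert(viaOp.ZF == viaCmp.ZF && viaOp.SF == viaCmp.SF);
      }
    return 0;
  }
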
@@ -4402,7 +4978,8 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
 /// Check whether the use can be converted to remove a comparison against zero.
 static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
-  default: return X86::COND_INVALID;
+  default:
+    return X86::COND_INVALID;
   case X86::NEG8r:
   case X86::NEG16r:
   case X86::NEG32r:
@@ -4435,7 +5012,7 @@ static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
   case X86::BLSMSK32rr:
   case X86::BLSMSK64rr:
     return X86::COND_B;
-  // TODO: TBM instructions.
+    // TODO: TBM instructions.
   }
 }
 
@@ -4448,7 +5025,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
                                         const MachineRegisterInfo *MRI) const {
   // Check whether we can replace SUB with CMP.
   switch (CmpInstr.getOpcode()) {
-  default: break;
+  default:
+    break;
   case X86::SUB64ri32:
   case X86::SUB32ri:
   case X86::SUB16ri:
@@ -4466,19 +5044,44 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     // There is no use of the destination register, we can replace SUB with CMP.
     unsigned NewOpcode = 0;
     switch (CmpInstr.getOpcode()) {
-    default: llvm_unreachable("Unreachable!");
-    case X86::SUB64rm:   NewOpcode = X86::CMP64rm;   break;
-    case X86::SUB32rm:   NewOpcode = X86::CMP32rm;   break;
-    case X86::SUB16rm:   NewOpcode = X86::CMP16rm;   break;
-    case X86::SUB8rm:    NewOpcode = X86::CMP8rm;    break;
-    case X86::SUB64rr:   NewOpcode = X86::CMP64rr;   break;
-    case X86::SUB32rr:   NewOpcode = X86::CMP32rr;   break;
-    case X86::SUB16rr:   NewOpcode = X86::CMP16rr;   break;
-    case X86::SUB8rr:    NewOpcode = X86::CMP8rr;    break;
-    case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
-    case X86::SUB32ri:   NewOpcode = X86::CMP32ri;   break;
-    case X86::SUB16ri:   NewOpcode = X86::CMP16ri;   break;
-    case X86::SUB8ri:    NewOpcode = X86::CMP8ri;    break;
+    default:
+      llvm_unreachable("Unreachable!");
+    case X86::SUB64rm:
+      NewOpcode = X86::CMP64rm;
+      break;
+    case X86::SUB32rm:
+      NewOpcode = X86::CMP32rm;
+      break;
+    case X86::SUB16rm:
+      NewOpcode = X86::CMP16rm;
+      break;
+    case X86::SUB8rm:
+      NewOpcode = X86::CMP8rm;
+      break;
+    case X86::SUB64rr:
+      NewOpcode = X86::CMP64rr;
+      break;
+    case X86::SUB32rr:
+      NewOpcode = X86::CMP32rr;
+      break;
+    case X86::SUB16rr:
+      NewOpcode = X86::CMP16rr;
+      break;
+    case X86::SUB8rr:
+      NewOpcode = X86::CMP8rr;
+      break;
+    case X86::SUB64ri32:
+      NewOpcode = X86::CMP64ri32;
+      break;
+    case X86::SUB32ri:
+      NewOpcode = X86::CMP32ri;
+      break;
+    case X86::SUB16ri:
+      NewOpcode = X86::CMP16ri;
+      break;
+    case X86::SUB8ri:
+      NewOpcode = X86::CMP8ri;
+      break;
     }
     CmpInstr.setDesc(get(NewOpcode));
     CmpInstr.removeOperand(0);
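
(Illustrative aside, not part of this patch: CMP behaves as a SUB that discards its result, so when the destination of the SUB is otherwise unused the two set EFLAGS identically and only the opcode needs to change, as the table above does. A standalone sketch of that equivalence:)

  // sub_to_cmp_sketch.cpp -- CMP is a SUB whose result is thrown away.
  #include <cassert>
  #include <cstdint>

  struct Flags { bool ZF, SF, CF; };
  Flags computeFlags(uint32_t a, uint32_t b) {
    uint32_t r = a - b;
    return {r == 0, static_cast<int32_t>(r) < 0, a < b}; // CF = borrow
  }

  // "sub dst, src": writes dst and the flags.
  Flags subInstr(uint32_t &dst, uint32_t src) {
    Flags f = computeFlags(dst, src);
    dst -= src;
    return f;
  }
  // "cmp a, b": writes only the flags.
  Flags cmpInstr(uint32_t a, uint32_t b) { return computeFlags(a, b); }

  int main() {
    uint32_t a = 7, b = 9, dst = a;
    Flags fs = subInstr(dst, b);
    Flags fc = cmpInstr(a, b);
    // Identical flags; the only difference is the (unwanted) write to dst.
    assert(fs.ZF == fc.ZF && fs.SF == fc.SF && fs.CF == fc.CF);
    return 0;
  }
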
@@ -4614,7 +5217,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   // If we are done with the basic block, we need to check whether EFLAGS is
   // live-out.
   bool FlagsMayLiveOut = true;
-  SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate;
+  SmallVector<std::pair<MachineInstr *, X86::CondCode>, 4> OpsToUpdate;
   MachineBasicBlock::iterator AfterCmpInstr =
       std::next(MachineBasicBlock::iterator(CmpInstr));
   for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
@@ -4637,24 +5240,31 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     X86::CondCode ReplacementCC = X86::COND_INVALID;
     if (MI) {
       switch (OldCC) {
-      default: break;
-      case X86::COND_A: case X86::COND_AE:
-      case X86::COND_B: case X86::COND_BE:
+      default:
+        break;
+      case X86::COND_A:
+      case X86::COND_AE:
+      case X86::COND_B:
+      case X86::COND_BE:
         // CF is used, we can't perform this optimization.
         return false;
-      case X86::COND_G: case X86::COND_GE:
-      case X86::COND_L: case X86::COND_LE:
+      case X86::COND_G:
+      case X86::COND_GE:
+      case X86::COND_L:
+      case X86::COND_LE:
         // If SF is used, but the instruction doesn't update the SF, then we
         // can't do the optimization.
         if (NoSignFlag)
           return false;
         [[fallthrough]];
-      case X86::COND_O: case X86::COND_NO:
+      case X86::COND_O:
+      case X86::COND_NO:
         // If OF is used, the instruction needs to clear it like CmpZero does.
         if (!ClearsOverflowFlag)
           return false;
         break;
-      case X86::COND_S: case X86::COND_NS:
+      case X86::COND_S:
+      case X86::COND_NS:
         // If SF is used, but the instruction doesn't update the SF, then we
         // can't do the optimization.
         if (NoSignFlag)
@@ -4850,130 +5460,130 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
 /// ShiftRotate will be set to true if the Opcode is shift or rotate.
 /// If the ALUri can be further changed to COPY when the immediate is 0, set
 /// CanConvert2Copy to true.
-static unsigned ConvertALUrr2ALUri(unsigned Opcode, bool &CanConvert2Copy,
-                                   bool &ShiftRotate) {
-  CanConvert2Copy = false;
-  ShiftRotate = false;
-  unsigned NewOpcode = 0;
-  switch (Opcode) {
-    case X86::ADD64rr:
-      NewOpcode = X86::ADD64ri32;
-      CanConvert2Copy = true;
-      break;
-    case X86::ADC64rr:
-      NewOpcode = X86::ADC64ri32;
-      break;
-    case X86::SUB64rr:
-      NewOpcode = X86::SUB64ri32;
-      CanConvert2Copy = true;
-      break;
-    case X86::SBB64rr:
-      NewOpcode = X86::SBB64ri32;
-      break;
-    case X86::AND64rr:
-      NewOpcode = X86::AND64ri32;
-      break;
-    case X86::OR64rr:
-      NewOpcode = X86::OR64ri32;
-      CanConvert2Copy = true;
-      break;
-    case X86::XOR64rr:
-      NewOpcode = X86::XOR64ri32;
-      CanConvert2Copy = true;
-      break;
-    case X86::TEST64rr:
-      NewOpcode = X86::TEST64ri32;
-      break;
-    case X86::CMP64rr:
-      NewOpcode = X86::CMP64ri32;
-      break;
-    case X86::SHR64rCL:
-      NewOpcode = X86::SHR64ri;
-      ShiftRotate = true;
-      break;
-    case X86::SHL64rCL:
-      NewOpcode = X86::SHL64ri;
-      ShiftRotate = true;
-      break;
-    case X86::SAR64rCL:
-      NewOpcode = X86::SAR64ri;
-      ShiftRotate = true;
-      break;
-    case X86::ROL64rCL:
-      NewOpcode = X86::ROL64ri;
-      ShiftRotate = true;
-      break;
-    case X86::ROR64rCL:
-      NewOpcode = X86::ROR64ri;
-      ShiftRotate = true;
-      break;
-    case X86::RCL64rCL:
-      NewOpcode = X86::RCL64ri;
-      ShiftRotate = true;
-      break;
-    case X86::RCR64rCL:
-      NewOpcode = X86::RCR64ri;
-      ShiftRotate = true;
-      break;
-    case X86::ADD32rr:
-      NewOpcode = X86::ADD32ri;
-      CanConvert2Copy = true;
-      break;
-    case X86::ADC32rr:
-      NewOpcode = X86::ADC32ri;
-      break;
-    case X86::SUB32rr:
-      NewOpcode = X86::SUB32ri;
-      CanConvert2Copy = true;
-      break;
-    case X86::SBB32rr:
-      NewOpcode = X86::SBB32ri;
-      break;
-    case X86::AND32rr:
-      NewOpcode = X86::AND32ri;
-      break;
-    case X86::OR32rr:
-      NewOpcode = X86::OR32ri;
-      CanConvert2Copy = true;
-      break;
-    case X86::XOR32rr:
-      NewOpcode = X86::XOR32ri;
-      CanConvert2Copy = true;
-      break;
-    case X86::TEST32rr:
-      NewOpcode = X86::TEST32ri;
-      break;
-    case X86::CMP32rr:
-      NewOpcode = X86::CMP32ri;
-      break;
-    case X86::SHR32rCL:
-      NewOpcode = X86::SHR32ri;
-      ShiftRotate = true;
-      break;
-    case X86::SHL32rCL:
-      NewOpcode = X86::SHL32ri;
-      ShiftRotate = true;
-      break;
-    case X86::SAR32rCL:
-      NewOpcode = X86::SAR32ri;
-      ShiftRotate = true;
-      break;
-    case X86::ROL32rCL:
-      NewOpcode = X86::ROL32ri;
-      ShiftRotate = true;
-      break;
-    case X86::ROR32rCL:
-      NewOpcode = X86::ROR32ri;
-      ShiftRotate = true;
-      break;
-    case X86::RCL32rCL:
-      NewOpcode = X86::RCL32ri;
-      ShiftRotate = true;
-      break;
-    case X86::RCR32rCL:
-      NewOpcode = X86::RCR32ri;
-      ShiftRotate = true;
-      break;
+static unsigned ConvertALUrr2ALUri(unsigned Opcode, bool &CanConvert2Copy,
+                                   bool &ShiftRotate) {
+  CanConvert2Copy = false;
+  ShiftRotate = false;
+  unsigned NewOpcode = 0;
+  switch (Opcode) {
+  case X86::ADD64rr:
+    NewOpcode = X86::ADD64ri32;
+    CanConvert2Copy = true;
+    break;
+  case X86::ADC64rr:
+    NewOpcode = X86::ADC64ri32;
+    break;
+  case X86::SUB64rr:
+    NewOpcode = X86::SUB64ri32;
+    CanConvert2Copy = true;
+    break;
+  case X86::SBB64rr:
+    NewOpcode = X86::SBB64ri32;
+    break;
+  case X86::AND64rr:
+    NewOpcode = X86::AND64ri32;
+    break;
+  case X86::OR64rr:
+    NewOpcode = X86::OR64ri32;
+    CanConvert2Copy = true;
+    break;
+  case X86::XOR64rr:
+    NewOpcode = X86::XOR64ri32;
+    CanConvert2Copy = true;
+    break;
+  case X86::TEST64rr:
+    NewOpcode = X86::TEST64ri32;
+    break;
+  case X86::CMP64rr:
+    NewOpcode = X86::CMP64ri32;
+    break;
+  case X86::SHR64rCL:
+    NewOpcode = X86::SHR64ri;
+    ShiftRotate = true;
+    break;
+  case X86::SHL64rCL:
+    NewOpcode = X86::SHL64ri;
+    ShiftRotate = true;
+    break;
+  case X86::SAR64rCL:
+    NewOpcode = X86::SAR64ri;
+    ShiftRotate = true;
+    break;
+  case X86::ROL64rCL:
+    NewOpcode = X86::ROL64ri;
+    ShiftRotate = true;
+    break;
+  case X86::ROR64rCL:
+    NewOpcode = X86::ROR64ri;
+    ShiftRotate = true;
+    break;
+  case X86::RCL64rCL:
+    NewOpcode = X86::RCL64ri;
+    ShiftRotate = true;
+    break;
+  case X86::RCR64rCL:
+    NewOpcode = X86::RCR64ri;
+    ShiftRotate = true;
+    break;
+  case X86::ADD32rr:
+    NewOpcode = X86::ADD32ri;
+    CanConvert2Copy = true;
+    break;
+  case X86::ADC32rr:
+    NewOpcode = X86::ADC32ri;
+    break;
+  case X86::SUB32rr:
+    NewOpcode = X86::SUB32ri;
+    CanConvert2Copy = true;
+    break;
+  case X86::SBB32rr:
+    NewOpcode = X86::SBB32ri;
+    break;
+  case X86::AND32rr:
+    NewOpcode = X86::AND32ri;
+    break;
+  case X86::OR32rr:
+    NewOpcode = X86::OR32ri;
+    CanConvert2Copy = true;
+    break;
+  case X86::XOR32rr:
+    NewOpcode = X86::XOR32ri;
+    CanConvert2Copy = true;
+    break;
+  case X86::TEST32rr:
+    NewOpcode = X86::TEST32ri;
+    break;
+  case X86::CMP32rr:
+    NewOpcode = X86::CMP32ri;
+    break;
+  case X86::SHR32rCL:
+    NewOpcode = X86::SHR32ri;
+    ShiftRotate = true;
+    break;
+  case X86::SHL32rCL:
+    NewOpcode = X86::SHL32ri;
+    ShiftRotate = true;
+    break;
+  case X86::SAR32rCL:
+    NewOpcode = X86::SAR32ri;
+    ShiftRotate = true;
+    break;
+  case X86::ROL32rCL:
+    NewOpcode = X86::ROL32ri;
+    ShiftRotate = true;
+    break;
+  case X86::ROR32rCL:
+    NewOpcode = X86::ROR32ri;
+    ShiftRotate = true;
+    break;
+  case X86::RCL32rCL:
+    NewOpcode = X86::RCL32ri;
+    ShiftRotate = true;
+    break;
+  case X86::RCR32rCL:
+    NewOpcode = X86::RCR32ri;
+    ShiftRotate = true;
+    break;
   }
   return NewOpcode;
 }
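
(Illustrative aside, not part of this patch: this table enables folding a register operand whose defining instruction is a known constant into the immediate form, and CanConvert2Copy records the cases where a folded constant of zero turns the whole instruction into a plain register copy. A standalone sketch of both facts:)

  // alu_imm_fold_sketch.cpp -- folding a known-constant register operand.
  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t rbx = 42;          // suppose the def of rbx is "mov rbx, 42"
    uint64_t rax = 7;
    uint64_t viaRR = rax + rbx; // add rax, rbx
    uint64_t viaRI = rax + 42;  // add rax, 42   (ADD64rr -> ADD64ri32)
    assert(viaRR == viaRI);

    // Identity immediates make the op a copy: add/sub/or/xor with 0.
    assert((rax + 0) == rax && (rax ^ 0) == rax && (rax | 0) == rax);
    return 0;
  }
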
@@ -5042,8 +5652,8 @@ bool X86InstrInfo::FoldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
       if (ImmVal == 0) {
         // MOV32r0 clobbers EFLAGS.
         const TargetRegisterInfo *TRI = &getRegisterInfo();
-        if (UseMI.getParent()->computeRegisterLiveness(TRI, X86::EFLAGS, UseMI)
-            != MachineBasicBlock::LQR_Dead)
+        if (UseMI.getParent()->computeRegisterLiveness(
+                TRI, X86::EFLAGS, UseMI) != MachineBasicBlock::LQR_Dead)
           return false;
 
         // MOV32r0 is different than other cases because it doesn't encode the
@@ -5052,10 +5662,10 @@ bool X86InstrInfo::FoldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
           return true;
         UseMI.setDesc(get(X86::MOV32r0));
         UseMI.removeOperand(UseMI.findRegisterUseOperandIdx(Reg));
-        UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/ true,
-                                                   /*isImp=*/ true,
-                                                   /*isKill=*/ false,
-                                                   /*isDead=*/ true));
+        UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true,
+                                                   /*isImp=*/true,
+                                                   /*isKill=*/false,
+                                                   /*isDead=*/true));
         Modified = true;
       }
     } else if (GR8Reg)
@@ -5117,7 +5727,7 @@ bool X86InstrInfo::FoldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
       unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
       unsigned ImmOpNum = 2;
       if (!UseMI.getOperand(0).isDef()) {
-        Op1 = 0;                                      // TEST, CMP
+        Op1 = 0; // TEST, CMP
         ImmOpNum = 1;
       }
       if (Opc == TargetOpcode::COPY)
@@ -5166,8 +5776,7 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
   // implicit operands.
   MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
   // But we don't trust that.
-  assert(MIB.getReg(1) == Reg &&
-         MIB.getReg(2) == Reg && "Misplaced operand");
+  assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand");
   return true;
 }
 
@@ -5222,8 +5831,9 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
     X86MachineFunctionInfo *X86FI =
         MBB.getParent()->getInfo<X86MachineFunctionInfo>();
     if (X86FI->getUsesRedZone()) {
-      MIB->setDesc(TII.get(MIB->getOpcode() ==
-                           X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
+      MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8
+                               ? X86::MOV32ri
+                               : X86::MOV64ri));
       return true;
     }
 
@@ -5232,8 +5842,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
     StackAdjustment = 8;
     BuildMI(MBB, I, DL, TII.get(X86::PUSH64i32)).addImm(Imm);
     MIB->setDesc(TII.get(X86::POP64r));
-    MIB->getOperand(0)
-        .setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
+    MIB->getOperand(0).setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
   } else {
     assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
     StackAdjustment = 4;
@@ -5250,9 +5859,11 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
   bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
   bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
   if (EmitCFI) {
-    TFL->BuildCFI(MBB, I, DL,
+    TFL->BuildCFI(
+        MBB, I, DL,
         MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
-    TFL->BuildCFI(MBB, std::next(I), DL,
+    TFL->BuildCFI(
+        MBB, std::next(I), DL,
         MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
   }
 
@@ -5275,8 +5886,12 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
       MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
   MachineBasicBlock::iterator I = MIB.getInstr();
 
-  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
-      .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
+  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg)
+      .addReg(X86::RIP)
+      .addImm(1)
+      .addReg(0)
+      .addGlobalAddress(GV, 0, X86II::MO_GOTPCREL)
+      .addReg(0)
       .addMemOperand(MMO);
   MIB->setDebugLoc(DL);
   MIB->setDesc(TII.get(X86::MOV64rm));
@@ -5301,8 +5916,7 @@ static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
 static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
                             const TargetRegisterInfo *TRI,
                             const MCInstrDesc &LoadDesc,
-                            const MCInstrDesc &BroadcastDesc,
-                            unsigned SubIdx) {
+                            const MCInstrDesc &BroadcastDesc, unsigned SubIdx) {
   Register DestReg = MIB.getReg(0);
   // Check if DestReg is XMM16-31 or YMM16-31.
   if (TRI->getEncodingValue(DestReg) < 16) {
@@ -5324,8 +5938,7 @@ static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
 static bool expandNOVLXStore(MachineInstrBuilder &MIB,
                              const TargetRegisterInfo *TRI,
                              const MCInstrDesc &StoreDesc,
-                             const MCInstrDesc &ExtractDesc,
-                             unsigned SubIdx) {
+                             const MCInstrDesc &ExtractDesc, unsigned SubIdx) {
   Register SrcReg = MIB.getReg(X86::AddrNumOperands);
   // Check if DestReg is XMM16-31 or YMM16-31.
   if (TRI->getEncodingValue(SrcReg) < 16) {
@@ -5349,8 +5962,7 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
   // Temporarily remove the immediate so we can add another source register.
   MIB->removeOperand(2);
   // Add the register. Don't copy the kill flag if there is one.
-  MIB.addReg(MIB.getReg(1),
-             getUndefRegState(MIB->getOperand(1).isUndef()));
+  MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef()));
   // Add back the immediate.
   MIB.addImm(ShiftAmt);
   return true;
@@ -5363,9 +5975,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case X86::MOV32r0:
     return Expand2AddrUndef(MIB, get(X86::XOR32rr));
   case X86::MOV32r1:
-    return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
+    return expandMOV32r1(MIB, *this, /*MinusOne=*/false);
   case X86::MOV32r_1:
-    return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
+    return expandMOV32r1(MIB, *this, /*MinusOne=*/true);
   case X86::MOV32ImmSExti8:
   case X86::MOV64ImmSExti8:
     return ExpandMOVImmSExti8(MIB, *this, Subtarget);
@@ -5416,21 +6028,21 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
       Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
       MIB->getOperand(0).setReg(XReg);
-      Expand2AddrUndef(MIB,
-                       get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
+      Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
       MIB.addReg(SrcReg, RegState::ImplicitDefine);
       return true;
     }
     if (MI.getOpcode() == X86::AVX512_256_SET0) {
       // No VLX so we must reference a zmm.
       unsigned ZReg =
-        TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
+          TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
       MIB->getOperand(0).setReg(ZReg);
     }
     return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
   }
   case X86::V_SETALLONES:
-    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+    return Expand2AddrUndef(MIB,
+                            get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
   case X86::AVX2_SETALLONES:
     return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
   case X86::AVX1_SETALLONES: {
@@ -5445,8 +6057,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MIB->setDesc(get(X86::VPTERNLOGDZrri));
     // VPTERNLOGD needs 3 register inputs and an immediate.
     // 0xff will return 1s for any input.
-    MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
-       .addReg(Reg, RegState::Undef).addImm(0xff);
+    MIB.addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef)
+        .addImm(0xff);
     return true;
   }
   case X86::AVX512_512_SEXT_MASK_32:
@@ -5454,14 +6068,18 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     Register Reg = MIB.getReg(0);
     Register MaskReg = MIB.getReg(1);
     unsigned MaskState = getRegState(MIB->getOperand(1));
-    unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
-                   X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
+    unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64)
+                       ? X86::VPTERNLOGQZrrikz
+                       : X86::VPTERNLOGDZrrikz;
     MI.removeOperand(1);
     MIB->setDesc(get(Opc));
     // VPTERNLOG needs 3 register inputs and an immediate.
     // 0xff will return 1s for any input.
-    MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
-       .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
+    MIB.addReg(Reg, RegState::Undef)
+        .addReg(MaskReg, MaskState)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef)
+        .addImm(0xff);
     return true;
   }
   case X86::VMOVAPSZ128rm_NOVLX:
@@ -5502,10 +6120,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     unsigned Is64Bit = MI.getOpcode() == X86::RDFLAGS64;
     MachineBasicBlock &MBB = *MIB->getParent();
 
-    MachineInstr *NewMI =
-        BuildMI(MBB, MI, MIB->getDebugLoc(),
-                get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32))
-            .getInstr();
+    MachineInstr *NewMI = BuildMI(MBB, MI, MIB->getDebugLoc(),
+                                  get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32))
+                              .getInstr();
 
     // Permit reads of the EFLAGS and DF registers without them being defined.
     // This intrinsic exists to read external processor state in flags, such as
@@ -5543,30 +6160,56 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   // registers, since it is not usable as a write mask.
   // FIXME: A more advanced approach would be to choose the best input mask
   // register based on context.
-  case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
-  case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
-  case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
-  case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
-  case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
-  case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
+  case X86::KSET0W:
+    return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
+  case X86::KSET0D:
+    return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
+  case X86::KSET0Q:
+    return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
+  case X86::KSET1W:
+    return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
+  case X86::KSET1D:
+    return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
+  case X86::KSET1Q:
+    return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
   case TargetOpcode::LOAD_STACK_GUARD:
     expandLoadStackGuard(MIB, *this);
     return true;
   case X86::XOR64_FP:
   case X86::XOR32_FP:
     return expandXorFP(MIB, *this);
-  case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8));
-  case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8));
-  case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8));
-  case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8));
-  case X86::ADD8rr_DB:    MIB->setDesc(get(X86::OR8rr));    break;
-  case X86::ADD16rr_DB:   MIB->setDesc(get(X86::OR16rr));   break;
-  case X86::ADD32rr_DB:   MIB->setDesc(get(X86::OR32rr));   break;
-  case X86::ADD64rr_DB:   MIB->setDesc(get(X86::OR64rr));   break;
-  case X86::ADD8ri_DB:    MIB->setDesc(get(X86::OR8ri));    break;
-  case X86::ADD16ri_DB:   MIB->setDesc(get(X86::OR16ri));   break;
-  case X86::ADD32ri_DB:   MIB->setDesc(get(X86::OR32ri));   break;
-  case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break;
+  case X86::SHLDROT32ri:
+    return expandSHXDROT(MIB, get(X86::SHLD32rri8));
+  case X86::SHLDROT64ri:
+    return expandSHXDROT(MIB, get(X86::SHLD64rri8));
+  case X86::SHRDROT32ri:
+    return expandSHXDROT(MIB, get(X86::SHRD32rri8));
+  case X86::SHRDROT64ri:
+    return expandSHXDROT(MIB, get(X86::SHRD64rri8));
+  case X86::ADD8rr_DB:
+    MIB->setDesc(get(X86::OR8rr));
+    break;
+  case X86::ADD16rr_DB:
+    MIB->setDesc(get(X86::OR16rr));
+    break;
+  case X86::ADD32rr_DB:
+    MIB->setDesc(get(X86::OR32rr));
+    break;
+  case X86::ADD64rr_DB:
+    MIB->setDesc(get(X86::OR64rr));
+    break;
+  case X86::ADD8ri_DB:
+    MIB->setDesc(get(X86::OR8ri));
+    break;
+  case X86::ADD16ri_DB:
+    MIB->setDesc(get(X86::OR16ri));
+    break;
+  case X86::ADD32ri_DB:
+    MIB->setDesc(get(X86::OR32ri));
+    break;
+  case X86::ADD64ri32_DB:
+    MIB->setDesc(get(X86::OR64ri32));
+    break;
   }
   return false;
 }
@@ -5587,8 +6230,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
 ///
 /// FIXME: This should be turned into a TSFlags.
 ///
-static bool hasPartialRegUpdate(unsigned Opcode,
-                                const X86Subtarget &Subtarget,
+static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
                                 bool ForLoadFold = false) {
   switch (Opcode) {
   case X86::CVTSI2SSrr:
@@ -6489,9 +7131,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
         int PtrOffset = SrcIdx * 4;
         unsigned NewImm = (DstIdx << 4) | ZMask;
         unsigned NewOpCode =
-            (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm :
-            (MI.getOpcode() == X86::VINSERTPSrr)  ? X86::VINSERTPSrm  :
-                                                    X86::INSERTPSrm;
+            (MI.getOpcode() == X86::VINSERTPSZrr)  ? X86::VINSERTPSZrm
+            : (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm
+                                                   : X86::INSERTPSrm;
         MachineInstr *NewMI =
             FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
         NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
@@ -6511,9 +7153,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
       unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
       if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
         unsigned NewOpCode =
-            (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
-            (MI.getOpcode() == X86::VMOVHLPSrr)  ? X86::VMOVLPSrm     :
-                                                   X86::MOVLPSrm;
+            (MI.getOpcode() == X86::VMOVHLPSZrr)  ? X86::VMOVLPSZ128rm
+            : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm
+                                                  : X86::MOVLPSrm;
         MachineInstr *NewMI =
             FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
         return NewMI;
@@ -6542,7 +7184,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
 
 static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
                                                MachineInstr &MI) {
-  if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/true) ||
+  if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/ true) ||
       !MI.getOperand(1).isReg())
     return false;
 
@@ -6577,7 +7219,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
 
   // Avoid partial and undef register update stalls unless optimizing for size.
   if (!MF.getFunction().hasOptSize() &&
-      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
+      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
        shouldPreventUndefRegUpdateMemFold(MF, MI)))
     return nullptr;
 
@@ -6639,13 +7281,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
         isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0;
     bool FoldedStore =
         isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE);
-    if (Alignment < Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT)))
+    if (Alignment <
+        Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT)))
       return nullptr;
     bool NarrowToMOV32rm = false;
     if (Size) {
       const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
-      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
-                                                  &RI, MF);
+      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
       unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
       // Check if it's safe to fold the load. If the size of the object is
       // narrower than the load width, then it's not.
@@ -6748,19 +7390,17 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   return nullptr;
 }
 
-MachineInstr *
-X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
-                                    ArrayRef<unsigned> Ops,
-                                    MachineBasicBlock::iterator InsertPt,
-                                    int FrameIndex, LiveIntervals *LIS,
-                                    VirtRegMap *VRM) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
+    VirtRegMap *VRM) const {
   // Check switch flag
   if (NoFusing)
     return nullptr;
 
   // Avoid partial and undef register update stalls unless optimizing for size.
   if (!MF.getFunction().hasOptSize() &&
-      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
+      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
        shouldPreventUndefRegUpdateMemFold(MF, MI)))
     return nullptr;
 
@@ -6784,11 +7424,24 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
     unsigned NewOpc = 0;
     unsigned RCSize = 0;
     switch (MI.getOpcode()) {
-    default: return nullptr;
-    case X86::TEST8rr:  NewOpc = X86::CMP8ri; RCSize = 1; break;
-    case X86::TEST16rr: NewOpc = X86::CMP16ri; RCSize = 2; break;
-    case X86::TEST32rr: NewOpc = X86::CMP32ri; RCSize = 4; break;
-    case X86::TEST64rr: NewOpc = X86::CMP64ri32; RCSize = 8; break;
+    default:
+      return nullptr;
+    case X86::TEST8rr:
+      NewOpc = X86::CMP8ri;
+      RCSize = 1;
+      break;
+    case X86::TEST16rr:
+      NewOpc = X86::CMP16ri;
+      RCSize = 2;
+      break;
+    case X86::TEST32rr:
+      NewOpc = X86::CMP32ri;
+      RCSize = 4;
+      break;
+    case X86::TEST64rr:
+      NewOpc = X86::CMP64ri32;
+      RCSize = 8;
+      break;
     }
     // Check if it's safe to fold the load. If the size of the object is
     // narrower than the load width, then it's not.
@@ -6842,61 +7495,125 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VCVTSS2SDZrr_Int:
     case X86::VCVTSS2SDZrr_Intk:
     case X86::VCVTSS2SDZrr_Intkz:
-    case X86::CVTSS2SIrr_Int:     case X86::CVTSS2SI64rr_Int:
-    case X86::VCVTSS2SIrr_Int:    case X86::VCVTSS2SI64rr_Int:
-    case X86::VCVTSS2SIZrr_Int:   case X86::VCVTSS2SI64Zrr_Int:
-    case X86::CVTTSS2SIrr_Int:    case X86::CVTTSS2SI64rr_Int:
-    case X86::VCVTTSS2SIrr_Int:   case X86::VCVTTSS2SI64rr_Int:
-    case X86::VCVTTSS2SIZrr_Int:  case X86::VCVTTSS2SI64Zrr_Int:
-    case X86::VCVTSS2USIZrr_Int:  case X86::VCVTSS2USI64Zrr_Int:
-    case X86::VCVTTSS2USIZrr_Int: case X86::VCVTTSS2USI64Zrr_Int:
-    case X86::RCPSSr_Int:   case X86::VRCPSSr_Int:
-    case X86::RSQRTSSr_Int: case X86::VRSQRTSSr_Int:
-    case X86::ROUNDSSr_Int: case X86::VROUNDSSr_Int:
-    case X86::COMISSrr_Int: case X86::VCOMISSrr_Int: case X86::VCOMISSZrr_Int:
-    case X86::UCOMISSrr_Int:case X86::VUCOMISSrr_Int:case X86::VUCOMISSZrr_Int:
-    case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
-    case X86::CMPSSrr_Int: case X86::VCMPSSrr_Int: case X86::VCMPSSZrr_Int:
-    case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
-    case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
-    case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
-    case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
-    case X86::SQRTSSr_Int: case X86::VSQRTSSr_Int: case X86::VSQRTSSZr_Int:
-    case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
-    case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
+    case X86::CVTSS2SIrr_Int:
+    case X86::CVTSS2SI64rr_Int:
+    case X86::VCVTSS2SIrr_Int:
+    case X86::VCVTSS2SI64rr_Int:
+    case X86::VCVTSS2SIZrr_Int:
+    case X86::VCVTSS2SI64Zrr_Int:
+    case X86::CVTTSS2SIrr_Int:
+    case X86::CVTTSS2SI64rr_Int:
+    case X86::VCVTTSS2SIrr_Int:
+    case X86::VCVTTSS2SI64rr_Int:
+    case X86::VCVTTSS2SIZrr_Int:
+    case X86::VCVTTSS2SI64Zrr_Int:
+    case X86::VCVTSS2USIZrr_Int:
+    case X86::VCVTSS2USI64Zrr_Int:
+    case X86::VCVTTSS2USIZrr_Int:
+    case X86::VCVTTSS2USI64Zrr_Int:
+    case X86::RCPSSr_Int:
+    case X86::VRCPSSr_Int:
+    case X86::RSQRTSSr_Int:
+    case X86::VRSQRTSSr_Int:
+    case X86::ROUNDSSr_Int:
+    case X86::VROUNDSSr_Int:
+    case X86::COMISSrr_Int:
+    case X86::VCOMISSrr_Int:
+    case X86::VCOMISSZrr_Int:
+    case X86::UCOMISSrr_Int:
+    case X86::VUCOMISSrr_Int:
+    case X86::VUCOMISSZrr_Int:
+    case X86::ADDSSrr_Int:
+    case X86::VADDSSrr_Int:
+    case X86::VADDSSZrr_Int:
+    case X86::CMPSSrr_Int:
+    case X86::VCMPSSrr_Int:
+    case X86::VCMPSSZrr_Int:
+    case X86::DIVSSrr_Int:
+    case X86::VDIVSSrr_Int:
+    case X86::VDIVSSZrr_Int:
+    case X86::MAXSSrr_Int:
+    case X86::VMAXSSrr_Int:
+    case X86::VMAXSSZrr_Int:
+    case X86::MINSSrr_Int:
+    case X86::VMINSSrr_Int:
+    case X86::VMINSSZrr_Int:
+    case X86::MULSSrr_Int:
+    case X86::VMULSSrr_Int:
+    case X86::VMULSSZrr_Int:
+    case X86::SQRTSSr_Int:
+    case X86::VSQRTSSr_Int:
+    case X86::VSQRTSSZr_Int:
+    case X86::SUBSSrr_Int:
+    case X86::VSUBSSrr_Int:
+    case X86::VSUBSSZrr_Int:
+    case X86::VADDSSZrr_Intk:
+    case X86::VADDSSZrr_Intkz:
     case X86::VCMPSSZrr_Intk:
-    case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
-    case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
-    case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
-    case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
-    case X86::VSQRTSSZr_Intk: case X86::VSQRTSSZr_Intkz:
-    case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
-    case X86::VFMADDSS4rr_Int:   case X86::VFNMADDSS4rr_Int:
-    case X86::VFMSUBSS4rr_Int:   case X86::VFNMSUBSS4rr_Int:
-    case X86::VFMADD132SSr_Int:  case X86::VFNMADD132SSr_Int:
-    case X86::VFMADD213SSr_Int:  case X86::VFNMADD213SSr_Int:
-    case X86::VFMADD231SSr_Int:  case X86::VFNMADD231SSr_Int:
-    case X86::VFMSUB132SSr_Int:  case X86::VFNMSUB132SSr_Int:
-    case X86::VFMSUB213SSr_Int:  case X86::VFNMSUB213SSr_Int:
-    case X86::VFMSUB231SSr_Int:  case X86::VFNMSUB231SSr_Int:
-    case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int:
-    case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int:
-    case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int:
-    case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
-    case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
-    case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
-    case X86::VFMADD132SSZr_Intk: case X86::VFNMADD132SSZr_Intk:
-    case X86::VFMADD213SSZr_Intk: case X86::VFNMADD213SSZr_Intk:
-    case X86::VFMADD231SSZr_Intk: case X86::VFNMADD231SSZr_Intk:
-    case X86::VFMSUB132SSZr_Intk: case X86::VFNMSUB132SSZr_Intk:
-    case X86::VFMSUB213SSZr_Intk: case X86::VFNMSUB213SSZr_Intk:
-    case X86::VFMSUB231SSZr_Intk: case X86::VFNMSUB231SSZr_Intk:
-    case X86::VFMADD132SSZr_Intkz: case X86::VFNMADD132SSZr_Intkz:
-    case X86::VFMADD213SSZr_Intkz: case X86::VFNMADD213SSZr_Intkz:
-    case X86::VFMADD231SSZr_Intkz: case X86::VFNMADD231SSZr_Intkz:
-    case X86::VFMSUB132SSZr_Intkz: case X86::VFNMSUB132SSZr_Intkz:
-    case X86::VFMSUB213SSZr_Intkz: case X86::VFNMSUB213SSZr_Intkz:
-    case X86::VFMSUB231SSZr_Intkz: case X86::VFNMSUB231SSZr_Intkz:
+    case X86::VDIVSSZrr_Intk:
+    case X86::VDIVSSZrr_Intkz:
+    case X86::VMAXSSZrr_Intk:
+    case X86::VMAXSSZrr_Intkz:
+    case X86::VMINSSZrr_Intk:
+    case X86::VMINSSZrr_Intkz:
+    case X86::VMULSSZrr_Intk:
+    case X86::VMULSSZrr_Intkz:
+    case X86::VSQRTSSZr_Intk:
+    case X86::VSQRTSSZr_Intkz:
+    case X86::VSUBSSZrr_Intk:
+    case X86::VSUBSSZrr_Intkz:
+    case X86::VFMADDSS4rr_Int:
+    case X86::VFNMADDSS4rr_Int:
+    case X86::VFMSUBSS4rr_Int:
+    case X86::VFNMSUBSS4rr_Int:
+    case X86::VFMADD132SSr_Int:
+    case X86::VFNMADD132SSr_Int:
+    case X86::VFMADD213SSr_Int:
+    case X86::VFNMADD213SSr_Int:
+    case X86::VFMADD231SSr_Int:
+    case X86::VFNMADD231SSr_Int:
+    case X86::VFMSUB132SSr_Int:
+    case X86::VFNMSUB132SSr_Int:
+    case X86::VFMSUB213SSr_Int:
+    case X86::VFNMSUB213SSr_Int:
+    case X86::VFMSUB231SSr_Int:
+    case X86::VFNMSUB231SSr_Int:
+    case X86::VFMADD132SSZr_Int:
+    case X86::VFNMADD132SSZr_Int:
+    case X86::VFMADD213SSZr_Int:
+    case X86::VFNMADD213SSZr_Int:
+    case X86::VFMADD231SSZr_Int:
+    case X86::VFNMADD231SSZr_Int:
+    case X86::VFMSUB132SSZr_Int:
+    case X86::VFNMSUB132SSZr_Int:
+    case X86::VFMSUB213SSZr_Int:
+    case X86::VFNMSUB213SSZr_Int:
+    case X86::VFMSUB231SSZr_Int:
+    case X86::VFNMSUB231SSZr_Int:
+    case X86::VFMADD132SSZr_Intk:
+    case X86::VFNMADD132SSZr_Intk:
+    case X86::VFMADD213SSZr_Intk:
+    case X86::VFNMADD213SSZr_Intk:
+    case X86::VFMADD231SSZr_Intk:
+    case X86::VFNMADD231SSZr_Intk:
+    case X86::VFMSUB132SSZr_Intk:
+    case X86::VFNMSUB132SSZr_Intk:
+    case X86::VFMSUB213SSZr_Intk:
+    case X86::VFNMSUB213SSZr_Intk:
+    case X86::VFMSUB231SSZr_Intk:
+    case X86::VFNMSUB231SSZr_Intk:
+    case X86::VFMADD132SSZr_Intkz:
+    case X86::VFNMADD132SSZr_Intkz:
+    case X86::VFMADD213SSZr_Intkz:
+    case X86::VFNMADD213SSZr_Intkz:
+    case X86::VFMADD231SSZr_Intkz:
+    case X86::VFNMADD231SSZr_Intkz:
+    case X86::VFMSUB132SSZr_Intkz:
+    case X86::VFNMSUB132SSZr_Intkz:
+    case X86::VFMSUB213SSZr_Intkz:
+    case X86::VFNMSUB213SSZr_Intkz:
+    case X86::VFMSUB231SSZr_Intkz:
+    case X86::VFNMSUB231SSZr_Intkz:
     case X86::VFIXUPIMMSSZrri:
     case X86::VFIXUPIMMSSZrrik:
     case X86::VFIXUPIMMSSZrrikz:
@@ -6951,59 +7668,121 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VCVTSD2SSZrr_Int:
     case X86::VCVTSD2SSZrr_Intk:
     case X86::VCVTSD2SSZrr_Intkz:
-    case X86::CVTSD2SIrr_Int:     case X86::CVTSD2SI64rr_Int:
-    case X86::VCVTSD2SIrr_Int:    case X86::VCVTSD2SI64rr_Int:
-    case X86::VCVTSD2SIZrr_Int:   case X86::VCVTSD2SI64Zrr_Int:
-    case X86::CVTTSD2SIrr_Int:    case X86::CVTTSD2SI64rr_Int:
-    case X86::VCVTTSD2SIrr_Int:   case X86::VCVTTSD2SI64rr_Int:
-    case X86::VCVTTSD2SIZrr_Int:  case X86::VCVTTSD2SI64Zrr_Int:
-    case X86::VCVTSD2USIZrr_Int:  case X86::VCVTSD2USI64Zrr_Int:
-    case X86::VCVTTSD2USIZrr_Int: case X86::VCVTTSD2USI64Zrr_Int:
-    case X86::ROUNDSDr_Int: case X86::VROUNDSDr_Int:
-    case X86::COMISDrr_Int: case X86::VCOMISDrr_Int: case X86::VCOMISDZrr_Int:
-    case X86::UCOMISDrr_Int:case X86::VUCOMISDrr_Int:case X86::VUCOMISDZrr_Int:
-    case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
-    case X86::CMPSDrr_Int: case X86::VCMPSDrr_Int: case X86::VCMPSDZrr_Int:
-    case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
-    case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
-    case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
-    case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
-    case X86::SQRTSDr_Int: case X86::VSQRTSDr_Int: case X86::VSQRTSDZr_Int:
-    case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
-    case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
+    case X86::CVTSD2SIrr_Int:
+    case X86::CVTSD2SI64rr_Int:
+    case X86::VCVTSD2SIrr_Int:
+    case X86::VCVTSD2SI64rr_Int:
+    case X86::VCVTSD2SIZrr_Int:
+    case X86::VCVTSD2SI64Zrr_Int:
+    case X86::CVTTSD2SIrr_Int:
+    case X86::CVTTSD2SI64rr_Int:
+    case X86::VCVTTSD2SIrr_Int:
+    case X86::VCVTTSD2SI64rr_Int:
+    case X86::VCVTTSD2SIZrr_Int:
+    case X86::VCVTTSD2SI64Zrr_Int:
+    case X86::VCVTSD2USIZrr_Int:
+    case X86::VCVTSD2USI64Zrr_Int:
+    case X86::VCVTTSD2USIZrr_Int:
+    case X86::VCVTTSD2USI64Zrr_Int:
+    case X86::ROUNDSDr_Int:
+    case X86::VROUNDSDr_Int:
+    case X86::COMISDrr_Int:
+    case X86::VCOMISDrr_Int:
+    case X86::VCOMISDZrr_Int:
+    case X86::UCOMISDrr_Int:
+    case X86::VUCOMISDrr_Int:
+    case X86::VUCOMISDZrr_Int:
+    case X86::ADDSDrr_Int:
+    case X86::VADDSDrr_Int:
+    case X86::VADDSDZrr_Int:
+    case X86::CMPSDrr_Int:
+    case X86::VCMPSDrr_Int:
+    case X86::VCMPSDZrr_Int:
+    case X86::DIVSDrr_Int:
+    case X86::VDIVSDrr_Int:
+    case X86::VDIVSDZrr_Int:
+    case X86::MAXSDrr_Int:
+    case X86::VMAXSDrr_Int:
+    case X86::VMAXSDZrr_Int:
+    case X86::MINSDrr_Int:
+    case X86::VMINSDrr_Int:
+    case X86::VMINSDZrr_Int:
+    case X86::MULSDrr_Int:
+    case X86::VMULSDrr_Int:
+    case X86::VMULSDZrr_Int:
+    case X86::SQRTSDr_Int:
+    case X86::VSQRTSDr_Int:
+    case X86::VSQRTSDZr_Int:
+    case X86::SUBSDrr_Int:
+    case X86::VSUBSDrr_Int:
+    case X86::VSUBSDZrr_Int:
+    case X86::VADDSDZrr_Intk:
+    case X86::VADDSDZrr_Intkz:
     case X86::VCMPSDZrr_Intk:
-    case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
-    case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
-    case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
-    case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
-    case X86::VSQRTSDZr_Intk: case X86::VSQRTSDZr_Intkz:
-    case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
-    case X86::VFMADDSD4rr_Int:   case X86::VFNMADDSD4rr_Int:
-    case X86::VFMSUBSD4rr_Int:   case X86::VFNMSUBSD4rr_Int:
-    case X86::VFMADD132SDr_Int:  case X86::VFNMADD132SDr_Int:
-    case X86::VFMADD213SDr_Int:  case X86::VFNMADD213SDr_Int:
-    case X86::VFMADD231SDr_Int:  case X86::VFNMADD231SDr_Int:
-    case X86::VFMSUB132SDr_Int:  case X86::VFNMSUB132SDr_Int:
-    case X86::VFMSUB213SDr_Int:  case X86::VFNMSUB213SDr_Int:
-    case X86::VFMSUB231SDr_Int:  case X86::VFNMSUB231SDr_Int:
-    case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int:
-    case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int:
-    case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int:
-    case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
-    case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
-    case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
-    case X86::VFMADD132SDZr_Intk: case X86::VFNMADD132SDZr_Intk:
-    case X86::VFMADD213SDZr_Intk: case X86::VFNMADD213SDZr_Intk:
-    case X86::VFMADD231SDZr_Intk: case X86::VFNMADD231SDZr_Intk:
-    case X86::VFMSUB132SDZr_Intk: case X86::VFNMSUB132SDZr_Intk:
-    case X86::VFMSUB213SDZr_Intk: case X86::VFNMSUB213SDZr_Intk:
-    case X86::VFMSUB231SDZr_Intk: case X86::VFNMSUB231SDZr_Intk:
-    case X86::VFMADD132SDZr_Intkz: case X86::VFNMADD132SDZr_Intkz:
-    case X86::VFMADD213SDZr_Intkz: case X86::VFNMADD213SDZr_Intkz:
-    case X86::VFMADD231SDZr_Intkz: case X86::VFNMADD231SDZr_Intkz:
-    case X86::VFMSUB132SDZr_Intkz: case X86::VFNMSUB132SDZr_Intkz:
-    case X86::VFMSUB213SDZr_Intkz: case X86::VFNMSUB213SDZr_Intkz:
-    case X86::VFMSUB231SDZr_Intkz: case X86::VFNMSUB231SDZr_Intkz:
+    case X86::VDIVSDZrr_Intk:
+    case X86::VDIVSDZrr_Intkz:
+    case X86::VMAXSDZrr_Intk:
+    case X86::VMAXSDZrr_Intkz:
+    case X86::VMINSDZrr_Intk:
+    case X86::VMINSDZrr_Intkz:
+    case X86::VMULSDZrr_Intk:
+    case X86::VMULSDZrr_Intkz:
+    case X86::VSQRTSDZr_Intk:
+    case X86::VSQRTSDZr_Intkz:
+    case X86::VSUBSDZrr_Intk:
+    case X86::VSUBSDZrr_Intkz:
+    case X86::VFMADDSD4rr_Int:
+    case X86::VFNMADDSD4rr_Int:
+    case X86::VFMSUBSD4rr_Int:
+    case X86::VFNMSUBSD4rr_Int:
+    case X86::VFMADD132SDr_Int:
+    case X86::VFNMADD132SDr_Int:
+    case X86::VFMADD213SDr_Int:
+    case X86::VFNMADD213SDr_Int:
+    case X86::VFMADD231SDr_Int:
+    case X86::VFNMADD231SDr_Int:
+    case X86::VFMSUB132SDr_Int:
+    case X86::VFNMSUB132SDr_Int:
+    case X86::VFMSUB213SDr_Int:
+    case X86::VFNMSUB213SDr_Int:
+    case X86::VFMSUB231SDr_Int:
+    case X86::VFNMSUB231SDr_Int:
+    case X86::VFMADD132SDZr_Int:
+    case X86::VFNMADD132SDZr_Int:
+    case X86::VFMADD213SDZr_Int:
+    case X86::VFNMADD213SDZr_Int:
+    case X86::VFMADD231SDZr_Int:
+    case X86::VFNMADD231SDZr_Int:
+    case X86::VFMSUB132SDZr_Int:
+    case X86::VFNMSUB132SDZr_Int:
+    case X86::VFMSUB213SDZr_Int:
+    case X86::VFNMSUB213SDZr_Int:
+    case X86::VFMSUB231SDZr_Int:
+    case X86::VFNMSUB231SDZr_Int:
+    case X86::VFMADD132SDZr_Intk:
+    case X86::VFNMADD132SDZr_Intk:
+    case X86::VFMADD213SDZr_Intk:
+    case X86::VFNMADD213SDZr_Intk:
+    case X86::VFMADD231SDZr_Intk:
+    case X86::VFNMADD231SDZr_Intk:
+    case X86::VFMSUB132SDZr_Intk:
+    case X86::VFNMSUB132SDZr_Intk:
+    case X86::VFMSUB213SDZr_Intk:
+    case X86::VFNMSUB213SDZr_Intk:
+    case X86::VFMSUB231SDZr_Intk:
+    case X86::VFNMSUB231SDZr_Intk:
+    case X86::VFMADD132SDZr_Intkz:
+    case X86::VFNMADD132SDZr_Intkz:
+    case X86::VFMADD213SDZr_Intkz:
+    case X86::VFNMADD213SDZr_Intkz:
+    case X86::VFMADD231SDZr_Intkz:
+    case X86::VFNMADD231SDZr_Intkz:
+    case X86::VFMSUB132SDZr_Intkz:
+    case X86::VFNMSUB132SDZr_Intkz:
+    case X86::VFMSUB213SDZr_Intkz:
+    case X86::VFNMSUB213SDZr_Intkz:
+    case X86::VFMSUB231SDZr_Intkz:
+    case X86::VFNMSUB231SDZr_Intkz:
     case X86::VFIXUPIMMSDZrri:
     case X86::VFIXUPIMMSDZrrik:
     case X86::VFIXUPIMMSDZrrikz:
@@ -7057,31 +7836,55 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VMINSHZrr_Int:
     case X86::VMULSHZrr_Int:
     case X86::VSUBSHZrr_Int:
-    case X86::VADDSHZrr_Intk: case X86::VADDSHZrr_Intkz:
+    case X86::VADDSHZrr_Intk:
+    case X86::VADDSHZrr_Intkz:
     case X86::VCMPSHZrr_Intk:
-    case X86::VDIVSHZrr_Intk: case X86::VDIVSHZrr_Intkz:
-    case X86::VMAXSHZrr_Intk: case X86::VMAXSHZrr_Intkz:
-    case X86::VMINSHZrr_Intk: case X86::VMINSHZrr_Intkz:
-    case X86::VMULSHZrr_Intk: case X86::VMULSHZrr_Intkz:
-    case X86::VSUBSHZrr_Intk: case X86::VSUBSHZrr_Intkz:
-    case X86::VFMADD132SHZr_Int: case X86::VFNMADD132SHZr_Int:
-    case X86::VFMADD213SHZr_Int: case X86::VFNMADD213SHZr_Int:
-    case X86::VFMADD231SHZr_Int: case X86::VFNMADD231SHZr_Int:
-    case X86::VFMSUB132SHZr_Int: case X86::VFNMSUB132SHZr_Int:
-    case X86::VFMSUB213SHZr_Int: case X86::VFNMSUB213SHZr_Int:
-    case X86::VFMSUB231SHZr_Int: case X86::VFNMSUB231SHZr_Int:
-    case X86::VFMADD132SHZr_Intk: case X86::VFNMADD132SHZr_Intk:
-    case X86::VFMADD213SHZr_Intk: case X86::VFNMADD213SHZr_Intk:
-    case X86::VFMADD231SHZr_Intk: case X86::VFNMADD231SHZr_Intk:
-    case X86::VFMSUB132SHZr_Intk: case X86::VFNMSUB132SHZr_Intk:
-    case X86::VFMSUB213SHZr_Intk: case X86::VFNMSUB213SHZr_Intk:
-    case X86::VFMSUB231SHZr_Intk: case X86::VFNMSUB231SHZr_Intk:
-    case X86::VFMADD132SHZr_Intkz: case X86::VFNMADD132SHZr_Intkz:
-    case X86::VFMADD213SHZr_Intkz: case X86::VFNMADD213SHZr_Intkz:
-    case X86::VFMADD231SHZr_Intkz: case X86::VFNMADD231SHZr_Intkz:
-    case X86::VFMSUB132SHZr_Intkz: case X86::VFNMSUB132SHZr_Intkz:
-    case X86::VFMSUB213SHZr_Intkz: case X86::VFNMSUB213SHZr_Intkz:
-    case X86::VFMSUB231SHZr_Intkz: case X86::VFNMSUB231SHZr_Intkz:
+    case X86::VDIVSHZrr_Intk:
+    case X86::VDIVSHZrr_Intkz:
+    case X86::VMAXSHZrr_Intk:
+    case X86::VMAXSHZrr_Intkz:
+    case X86::VMINSHZrr_Intk:
+    case X86::VMINSHZrr_Intkz:
+    case X86::VMULSHZrr_Intk:
+    case X86::VMULSHZrr_Intkz:
+    case X86::VSUBSHZrr_Intk:
+    case X86::VSUBSHZrr_Intkz:
+    case X86::VFMADD132SHZr_Int:
+    case X86::VFNMADD132SHZr_Int:
+    case X86::VFMADD213SHZr_Int:
+    case X86::VFNMADD213SHZr_Int:
+    case X86::VFMADD231SHZr_Int:
+    case X86::VFNMADD231SHZr_Int:
+    case X86::VFMSUB132SHZr_Int:
+    case X86::VFNMSUB132SHZr_Int:
+    case X86::VFMSUB213SHZr_Int:
+    case X86::VFNMSUB213SHZr_Int:
+    case X86::VFMSUB231SHZr_Int:
+    case X86::VFNMSUB231SHZr_Int:
+    case X86::VFMADD132SHZr_Intk:
+    case X86::VFNMADD132SHZr_Intk:
+    case X86::VFMADD213SHZr_Intk:
+    case X86::VFNMADD213SHZr_Intk:
+    case X86::VFMADD231SHZr_Intk:
+    case X86::VFNMADD231SHZr_Intk:
+    case X86::VFMSUB132SHZr_Intk:
+    case X86::VFNMSUB132SHZr_Intk:
+    case X86::VFMSUB213SHZr_Intk:
+    case X86::VFNMSUB213SHZr_Intk:
+    case X86::VFMSUB231SHZr_Intk:
+    case X86::VFNMSUB231SHZr_Intk:
+    case X86::VFMADD132SHZr_Intkz:
+    case X86::VFNMADD132SHZr_Intkz:
+    case X86::VFMADD213SHZr_Intkz:
+    case X86::VFNMADD213SHZr_Intkz:
+    case X86::VFMADD231SHZr_Intkz:
+    case X86::VFNMADD231SHZr_Intkz:
+    case X86::VFMSUB132SHZr_Intkz:
+    case X86::VFNMSUB132SHZr_Intkz:
+    case X86::VFMSUB213SHZr_Intkz:
+    case X86::VFNMSUB213SHZr_Intkz:
+    case X86::VFMSUB231SHZr_Intkz:
+    case X86::VFNMSUB231SHZr_Intkz:
       return false;
     default:
       return true;
@@ -7113,11 +7916,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   }
 
   // Check switch flag
-  if (NoFusing) return nullptr;
+  if (NoFusing)
+    return nullptr;
 
   // Avoid partial and undef register update stalls unless optimizing for size.
   if (!MF.getFunction().hasOptSize() &&
-      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
+      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
        shouldPreventUndefRegUpdateMemFold(MF, MI)))
     return nullptr;
 
@@ -7163,11 +7967,20 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     unsigned NewOpc = 0;
     switch (MI.getOpcode()) {
-    default: return nullptr;
-    case X86::TEST8rr:  NewOpc = X86::CMP8ri; break;
-    case X86::TEST16rr: NewOpc = X86::CMP16ri; break;
-    case X86::TEST32rr: NewOpc = X86::CMP32ri; break;
-    case X86::TEST64rr: NewOpc = X86::CMP64ri32; break;
+    default:
+      return nullptr;
+    case X86::TEST8rr:
+      NewOpc = X86::CMP8ri;
+      break;
+    case X86::TEST16rr:
+      NewOpc = X86::CMP16ri;
+      break;
+    case X86::TEST32rr:
+      NewOpc = X86::CMP32ri;
+      break;
+    case X86::TEST64rr:
+      NewOpc = X86::CMP64ri32;
+      break;
     }
     // Change to CMPXXri r, 0 first.
     MI.setDesc(get(NewOpc));
@@ -7180,7 +7993,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
     return nullptr;
 
-  SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
+  SmallVector<MachineOperand, X86::AddrNumOperands> MOs;
   switch (LoadMI.getOpcode()) {
   case X86::MMX_SET0:
   case X86::V_SET0:
@@ -7248,11 +8061,11 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
       Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
                                 4);
 
-    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
-                      Opc == X86::AVX512_512_SETALLONES ||
-                      Opc == X86::AVX1_SETALLONES);
-    const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
-                                    Constant::getNullValue(Ty);
+    bool IsAllOnes =
+        (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
+         Opc == X86::AVX512_512_SETALLONES || Opc == X86::AVX1_SETALLONES);
+    const Constant *C =
+        IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty);
     unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
 
     // Create operands to load from the constant pool entry.
@@ -7328,37 +8141,54 @@ static unsigned getBroadcastOpcode(const X86FoldTableEntry *I,
          "Can't broadcast less than 64 bytes without AVX512VL!");
 
   switch (I->Flags & TB_BCAST_MASK) {
-  default: llvm_unreachable("Unexpected broadcast type!");
+  default:
+    llvm_unreachable("Unexpected broadcast type!");
   case TB_BCAST_D:
     switch (SpillSize) {
-    default: llvm_unreachable("Unknown spill size");
-    case 16: return X86::VPBROADCASTDZ128rm;
-    case 32: return X86::VPBROADCASTDZ256rm;
-    case 64: return X86::VPBROADCASTDZrm;
+    default:
+      llvm_unreachable("Unknown spill size");
+    case 16:
+      return X86::VPBROADCASTDZ128rm;
+    case 32:
+      return X86::VPBROADCASTDZ256rm;
+    case 64:
+      return X86::VPBROADCASTDZrm;
     }
     break;
   case TB_BCAST_Q:
     switch (SpillSize) {
-    default: llvm_unreachable("Unknown spill size");
-    case 16: return X86::VPBROADCASTQZ128rm;
-    case 32: return X86::VPBROADCASTQZ256rm;
-    case 64: return X86::VPBROADCASTQZrm;
+    default:
+      llvm_unreachable("Unknown spill size");
+    case 16:
+      return X86::VPBROADCASTQZ128rm;
+    case 32:
+      return X86::VPBROADCASTQZ256rm;
+    case 64:
+      return X86::VPBROADCASTQZrm;
     }
     break;
   case TB_BCAST_SS:
     switch (SpillSize) {
-    default: llvm_unreachable("Unknown spill size");
-    case 16: return X86::VBROADCASTSSZ128rm;
-    case 32: return X86::VBROADCASTSSZ256rm;
-    case 64: return X86::VBROADCASTSSZrm;
+    default:
+      llvm_unreachable("Unknown spill size");
+    case 16:
+      return X86::VBROADCASTSSZ128rm;
+    case 32:
+      return X86::VBROADCASTSSZ256rm;
+    case 64:
+      return X86::VBROADCASTSSZrm;
     }
     break;
   case TB_BCAST_SD:
     switch (SpillSize) {
-    default: llvm_unreachable("Unknown spill size");
-    case 16: return X86::VMOVDDUPZ128rm;
-    case 32: return X86::VBROADCASTSDZ256rm;
-    case 64: return X86::VBROADCASTSDZrm;
+    default:
+      llvm_unreachable("Unknown spill size");
+    case 16:
+      return X86::VMOVDDUPZ128rm;
+    case 32:
+      return X86::VBROADCASTSDZ256rm;
+    case 64:
+      return X86::VBROADCASTSDZrm;
     }
     break;
   }
@@ -7394,9 +8224,9 @@ bool X86InstrInfo::unfoldMemoryOperand(
     // performance.
     return false;
   SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
-  SmallVector<MachineOperand,2> BeforeOps;
-  SmallVector<MachineOperand,2> AfterOps;
-  SmallVector<MachineOperand,4> ImpOps;
+  SmallVector<MachineOperand, 2> BeforeOps;
+  SmallVector<MachineOperand, 2> AfterOps;
+  SmallVector<MachineOperand, 4> ImpOps;
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     MachineOperand &Op = MI.getOperand(i);
     if (i >= Index && i < Index + X86::AddrNumOperands)
@@ -7452,16 +8282,16 @@ bool X86InstrInfo::unfoldMemoryOperand(
   for (MachineOperand &AfterOp : AfterOps)
     MIB.add(AfterOp);
   for (MachineOperand &ImpOp : ImpOps) {
-    MIB.addReg(ImpOp.getReg(),
-               getDefRegState(ImpOp.isDef()) |
-               RegState::Implicit |
-               getKillRegState(ImpOp.isKill()) |
-               getDeadRegState(ImpOp.isDead()) |
-               getUndefRegState(ImpOp.isUndef()));
+    MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) |
+                                   RegState::Implicit |
+                                   getKillRegState(ImpOp.isKill()) |
+                                   getDeadRegState(ImpOp.isDead()) |
+                                   getUndefRegState(ImpOp.isUndef()));
   }
   // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
   switch (DataMI->getOpcode()) {
-  default: break;
+  default:
+    break;
   case X86::CMP64ri32:
   case X86::CMP32ri:
   case X86::CMP16ri:
@@ -7471,11 +8301,20 @@ bool X86InstrInfo::unfoldMemoryOperand(
     if (MO1.isImm() && MO1.getImm() == 0) {
       unsigned NewOpc;
       switch (DataMI->getOpcode()) {
-      default: llvm_unreachable("Unreachable!");
-      case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
-      case X86::CMP32ri:   NewOpc = X86::TEST32rr; break;
-      case X86::CMP16ri:   NewOpc = X86::TEST16rr; break;
-      case X86::CMP8ri:    NewOpc = X86::TEST8rr; break;
+      default:
+        llvm_unreachable("Unreachable!");
+      case X86::CMP64ri32:
+        NewOpc = X86::TEST64rr;
+        break;
+      case X86::CMP32ri:
+        NewOpc = X86::TEST32rr;
+        break;
+      case X86::CMP16ri:
+        NewOpc = X86::TEST16rr;
+        break;
+      case X86::CMP8ri:
+        NewOpc = X86::TEST8rr;
+        break;
       }
       DataMI->setDesc(get(NewOpc));
       MO1.ChangeToRegister(MO0.getReg(), false);
@@ -7503,9 +8342,8 @@ bool X86InstrInfo::unfoldMemoryOperand(
   return true;
 }
 
-bool
-X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
-                                  SmallVectorImpl<SDNode*> &NewNodes) const {
+bool X86InstrInfo::unfoldMemoryOperand(
+    SelectionDAG &DAG, SDNode *N, SmallVectorImpl<SDNode *> &NewNodes) const {
   if (!N->isMachineOpcode())
     return false;
 
@@ -7527,16 +8365,16 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   std::vector<SDValue> AfterOps;
   SDLoc dl(N);
   unsigned NumOps = N->getNumOperands();
-  for (unsigned i = 0; i != NumOps-1; ++i) {
+  for (unsigned i = 0; i != NumOps - 1; ++i) {
     SDValue Op = N->getOperand(i);
-    if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
+    if (i >= Index - NumDefs && i < Index - NumDefs + X86::AddrNumOperands)
       AddrOps.push_back(Op);
-    else if (i < Index-NumDefs)
+    else if (i < Index - NumDefs)
       BeforeOps.push_back(Op);
-    else if (i > Index-NumDefs)
+    else if (i > Index - NumDefs)
       AfterOps.push_back(Op);
   }
-  SDValue Chain = N->getOperand(NumOps-1);
+  SDValue Chain = N->getOperand(NumOps - 1);
   AddrOps.push_back(Chain);
 
   // Emit the load instruction.
@@ -7584,23 +8422,33 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   llvm::append_range(BeforeOps, AfterOps);
   // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
   switch (Opc) {
-    default: break;
-    case X86::CMP64ri32:
-    case X86::CMP32ri:
-    case X86::CMP16ri:
-    case X86::CMP8ri:
-      if (isNullConstant(BeforeOps[1])) {
-        switch (Opc) {
-          default: llvm_unreachable("Unreachable!");
-          case X86::CMP64ri32: Opc = X86::TEST64rr; break;
-          case X86::CMP32ri:   Opc = X86::TEST32rr; break;
-          case X86::CMP16ri:   Opc = X86::TEST16rr; break;
-          case X86::CMP8ri:    Opc = X86::TEST8rr; break;
-        }
-        BeforeOps[1] = BeforeOps[0];
+  default:
+    break;
+  case X86::CMP64ri32:
+  case X86::CMP32ri:
+  case X86::CMP16ri:
+  case X86::CMP8ri:
+    if (isNullConstant(BeforeOps[1])) {
+      switch (Opc) {
+      default:
+        llvm_unreachable("Unreachable!");
+      case X86::CMP64ri32:
+        Opc = X86::TEST64rr;
+        break;
+      case X86::CMP32ri:
+        Opc = X86::TEST32rr;
+        break;
+      case X86::CMP16ri:
+        Opc = X86::TEST16rr;
+        break;
+      case X86::CMP8ri:
+        Opc = X86::TEST8rr;
+        break;
       }
+      BeforeOps[1] = BeforeOps[0];
+    }
   }
-  SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
+  SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
   NewNodes.push_back(NewNode);
 
   // Emit the store instruction.
@@ -7629,9 +8477,10 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   return true;
 }
 
-unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
-                                      bool UnfoldLoad, bool UnfoldStore,
-                                      unsigned *LoadRegIndex) const {
+unsigned
+X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad,
+                                         bool UnfoldStore,
+                                         unsigned *LoadRegIndex) const {
   const X86FoldTableEntry *I = lookupUnfoldTable(Opc);
   if (I == nullptr)
     return 0;
@@ -7646,9 +8495,9 @@ unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
   return I->DstOp;
 }
 
-bool
-X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
-                                     int64_t &Offset1, int64_t &Offset2) const {
+bool X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                           int64_t &Offset1,
+                                           int64_t &Offset2) const {
   if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
     return false;
 
@@ -7782,10 +8631,11 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
   unsigned Opc1 = Load1->getMachineOpcode();
   unsigned Opc2 = Load2->getMachineOpcode();
   if (Opc1 != Opc2)
-    return false;  // FIXME: overly conservative?
+    return false; // FIXME: overly conservative?
 
   switch (Opc1) {
-  default: break;
+  default:
+    break;
   case X86::LD_Fp32m:
   case X86::LD_Fp64m:
   case X86::LD_Fp80m:
@@ -7833,16 +8683,16 @@ bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
 }
 
-bool X86InstrInfo::
-reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+bool X86InstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
   assert(Cond.size() == 1 && "Invalid X86 branch condition!");
   X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
   Cond[0].setImm(GetOppositeBranchCondition(CC));
   return false;
 }
 
-bool X86InstrInfo::
-isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+bool X86InstrInfo::isSafeToMoveRegClassDefs(
+    const TargetRegisterClass *RC) const {
   // FIXME: Return false for x87 stack register classes for now. We can't
   // allow any loads of these registers before FpGet_ST0_80.
   return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
@@ -7876,515 +8726,13 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
   return GlobalBaseReg;
 }
 
-// These are the replaceable SSE instructions. Some of these have Int variants
-// that we don't include here. We don't want to replace instructions selected
-// by intrinsics.
-static const uint16_t ReplaceableInstrs[][3] = {
-  //PackedSingle     PackedDouble    PackedInt
-  { X86::MOVAPSmr,   X86::MOVAPDmr,  X86::MOVDQAmr  },
-  { X86::MOVAPSrm,   X86::MOVAPDrm,  X86::MOVDQArm  },
-  { X86::MOVAPSrr,   X86::MOVAPDrr,  X86::MOVDQArr  },
-  { X86::MOVUPSmr,   X86::MOVUPDmr,  X86::MOVDQUmr  },
-  { X86::MOVUPSrm,   X86::MOVUPDrm,  X86::MOVDQUrm  },
-  { X86::MOVLPSmr,   X86::MOVLPDmr,  X86::MOVPQI2QImr },
-  { X86::MOVSDmr,    X86::MOVSDmr,   X86::MOVPQI2QImr },
-  { X86::MOVSSmr,    X86::MOVSSmr,   X86::MOVPDI2DImr },
-  { X86::MOVSDrm,    X86::MOVSDrm,   X86::MOVQI2PQIrm },
-  { X86::MOVSDrm_alt,X86::MOVSDrm_alt,X86::MOVQI2PQIrm },
-  { X86::MOVSSrm,    X86::MOVSSrm,   X86::MOVDI2PDIrm },
-  { X86::MOVSSrm_alt,X86::MOVSSrm_alt,X86::MOVDI2PDIrm },
-  { X86::MOVNTPSmr,  X86::MOVNTPDmr, X86::MOVNTDQmr },
-  { X86::ANDNPSrm,   X86::ANDNPDrm,  X86::PANDNrm   },
-  { X86::ANDNPSrr,   X86::ANDNPDrr,  X86::PANDNrr   },
-  { X86::ANDPSrm,    X86::ANDPDrm,   X86::PANDrm    },
-  { X86::ANDPSrr,    X86::ANDPDrr,   X86::PANDrr    },
-  { X86::ORPSrm,     X86::ORPDrm,    X86::PORrm     },
-  { X86::ORPSrr,     X86::ORPDrr,    X86::PORrr     },
-  { X86::XORPSrm,    X86::XORPDrm,   X86::PXORrm    },
-  { X86::XORPSrr,    X86::XORPDrr,   X86::PXORrr    },
-  { X86::UNPCKLPDrm, X86::UNPCKLPDrm, X86::PUNPCKLQDQrm },
-  { X86::MOVLHPSrr,  X86::UNPCKLPDrr, X86::PUNPCKLQDQrr },
-  { X86::UNPCKHPDrm, X86::UNPCKHPDrm, X86::PUNPCKHQDQrm },
-  { X86::UNPCKHPDrr, X86::UNPCKHPDrr, X86::PUNPCKHQDQrr },
-  { X86::UNPCKLPSrm, X86::UNPCKLPSrm, X86::PUNPCKLDQrm },
-  { X86::UNPCKLPSrr, X86::UNPCKLPSrr, X86::PUNPCKLDQrr },
-  { X86::UNPCKHPSrm, X86::UNPCKHPSrm, X86::PUNPCKHDQrm },
-  { X86::UNPCKHPSrr, X86::UNPCKHPSrr, X86::PUNPCKHDQrr },
-  { X86::EXTRACTPSmr, X86::EXTRACTPSmr, X86::PEXTRDmr },
-  { X86::EXTRACTPSrr, X86::EXTRACTPSrr, X86::PEXTRDrr },
-  // AVX 128-bit support
-  { X86::VMOVAPSmr,  X86::VMOVAPDmr,  X86::VMOVDQAmr  },
-  { X86::VMOVAPSrm,  X86::VMOVAPDrm,  X86::VMOVDQArm  },
-  { X86::VMOVAPSrr,  X86::VMOVAPDrr,  X86::VMOVDQArr  },
-  { X86::VMOVUPSmr,  X86::VMOVUPDmr,  X86::VMOVDQUmr  },
-  { X86::VMOVUPSrm,  X86::VMOVUPDrm,  X86::VMOVDQUrm  },
-  { X86::VMOVLPSmr,  X86::VMOVLPDmr,  X86::VMOVPQI2QImr },
-  { X86::VMOVSDmr,   X86::VMOVSDmr,   X86::VMOVPQI2QImr },
-  { X86::VMOVSSmr,   X86::VMOVSSmr,   X86::VMOVPDI2DImr },
-  { X86::VMOVSDrm,   X86::VMOVSDrm,   X86::VMOVQI2PQIrm },
-  { X86::VMOVSDrm_alt,X86::VMOVSDrm_alt,X86::VMOVQI2PQIrm },
-  { X86::VMOVSSrm,   X86::VMOVSSrm,   X86::VMOVDI2PDIrm },
-  { X86::VMOVSSrm_alt,X86::VMOVSSrm_alt,X86::VMOVDI2PDIrm },
-  { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
-  { X86::VANDNPSrm,  X86::VANDNPDrm,  X86::VPANDNrm   },
-  { X86::VANDNPSrr,  X86::VANDNPDrr,  X86::VPANDNrr   },
-  { X86::VANDPSrm,   X86::VANDPDrm,   X86::VPANDrm    },
-  { X86::VANDPSrr,   X86::VANDPDrr,   X86::VPANDrr    },
-  { X86::VORPSrm,    X86::VORPDrm,    X86::VPORrm     },
-  { X86::VORPSrr,    X86::VORPDrr,    X86::VPORrr     },
-  { X86::VXORPSrm,   X86::VXORPDrm,   X86::VPXORrm    },
-  { X86::VXORPSrr,   X86::VXORPDrr,   X86::VPXORrr    },
-  { X86::VUNPCKLPDrm, X86::VUNPCKLPDrm, X86::VPUNPCKLQDQrm },
-  { X86::VMOVLHPSrr,  X86::VUNPCKLPDrr, X86::VPUNPCKLQDQrr },
-  { X86::VUNPCKHPDrm, X86::VUNPCKHPDrm, X86::VPUNPCKHQDQrm },
-  { X86::VUNPCKHPDrr, X86::VUNPCKHPDrr, X86::VPUNPCKHQDQrr },
-  { X86::VUNPCKLPSrm, X86::VUNPCKLPSrm, X86::VPUNPCKLDQrm },
-  { X86::VUNPCKLPSrr, X86::VUNPCKLPSrr, X86::VPUNPCKLDQrr },
-  { X86::VUNPCKHPSrm, X86::VUNPCKHPSrm, X86::VPUNPCKHDQrm },
-  { X86::VUNPCKHPSrr, X86::VUNPCKHPSrr, X86::VPUNPCKHDQrr },
-  { X86::VEXTRACTPSmr, X86::VEXTRACTPSmr, X86::VPEXTRDmr },
-  { X86::VEXTRACTPSrr, X86::VEXTRACTPSrr, X86::VPEXTRDrr },
-  // AVX 256-bit support
-  { X86::VMOVAPSYmr,   X86::VMOVAPDYmr,   X86::VMOVDQAYmr  },
-  { X86::VMOVAPSYrm,   X86::VMOVAPDYrm,   X86::VMOVDQAYrm  },
-  { X86::VMOVAPSYrr,   X86::VMOVAPDYrr,   X86::VMOVDQAYrr  },
-  { X86::VMOVUPSYmr,   X86::VMOVUPDYmr,   X86::VMOVDQUYmr  },
-  { X86::VMOVUPSYrm,   X86::VMOVUPDYrm,   X86::VMOVDQUYrm  },
-  { X86::VMOVNTPSYmr,  X86::VMOVNTPDYmr,  X86::VMOVNTDQYmr },
-  { X86::VPERMPSYrm,   X86::VPERMPSYrm,   X86::VPERMDYrm },
-  { X86::VPERMPSYrr,   X86::VPERMPSYrr,   X86::VPERMDYrr },
-  { X86::VPERMPDYmi,   X86::VPERMPDYmi,   X86::VPERMQYmi },
-  { X86::VPERMPDYri,   X86::VPERMPDYri,   X86::VPERMQYri },
-  // AVX512 support
-  { X86::VMOVLPSZ128mr,  X86::VMOVLPDZ128mr,  X86::VMOVPQI2QIZmr  },
-  { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
-  { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr },
-  { X86::VMOVNTPSZmr,    X86::VMOVNTPDZmr,    X86::VMOVNTDQZmr    },
-  { X86::VMOVSDZmr,      X86::VMOVSDZmr,      X86::VMOVPQI2QIZmr  },
-  { X86::VMOVSSZmr,      X86::VMOVSSZmr,      X86::VMOVPDI2DIZmr  },
-  { X86::VMOVSDZrm,      X86::VMOVSDZrm,      X86::VMOVQI2PQIZrm  },
-  { X86::VMOVSDZrm_alt,  X86::VMOVSDZrm_alt,  X86::VMOVQI2PQIZrm  },
-  { X86::VMOVSSZrm,      X86::VMOVSSZrm,      X86::VMOVDI2PDIZrm  },
-  { X86::VMOVSSZrm_alt,  X86::VMOVSSZrm_alt,  X86::VMOVDI2PDIZrm  },
-  { X86::VBROADCASTSSZ128rr,X86::VBROADCASTSSZ128rr,X86::VPBROADCASTDZ128rr },
-  { X86::VBROADCASTSSZ128rm,X86::VBROADCASTSSZ128rm,X86::VPBROADCASTDZ128rm },
-  { X86::VBROADCASTSSZ256rr,X86::VBROADCASTSSZ256rr,X86::VPBROADCASTDZ256rr },
-  { X86::VBROADCASTSSZ256rm,X86::VBROADCASTSSZ256rm,X86::VPBROADCASTDZ256rm },
-  { X86::VBROADCASTSSZrr,   X86::VBROADCASTSSZrr,   X86::VPBROADCASTDZrr },
-  { X86::VBROADCASTSSZrm,   X86::VBROADCASTSSZrm,   X86::VPBROADCASTDZrm },
-  { X86::VMOVDDUPZ128rr,    X86::VMOVDDUPZ128rr,    X86::VPBROADCASTQZ128rr },
-  { X86::VMOVDDUPZ128rm,    X86::VMOVDDUPZ128rm,    X86::VPBROADCASTQZ128rm },
-  { X86::VBROADCASTSDZ256rr,X86::VBROADCASTSDZ256rr,X86::VPBROADCASTQZ256rr },
-  { X86::VBROADCASTSDZ256rm,X86::VBROADCASTSDZ256rm,X86::VPBROADCASTQZ256rm },
-  { X86::VBROADCASTSDZrr,   X86::VBROADCASTSDZrr,   X86::VPBROADCASTQZrr },
-  { X86::VBROADCASTSDZrm,   X86::VBROADCASTSDZrm,   X86::VPBROADCASTQZrm },
-  { X86::VINSERTF32x4Zrr,   X86::VINSERTF32x4Zrr,   X86::VINSERTI32x4Zrr },
-  { X86::VINSERTF32x4Zrm,   X86::VINSERTF32x4Zrm,   X86::VINSERTI32x4Zrm },
-  { X86::VINSERTF32x8Zrr,   X86::VINSERTF32x8Zrr,   X86::VINSERTI32x8Zrr },
-  { X86::VINSERTF32x8Zrm,   X86::VINSERTF32x8Zrm,   X86::VINSERTI32x8Zrm },
-  { X86::VINSERTF64x2Zrr,   X86::VINSERTF64x2Zrr,   X86::VINSERTI64x2Zrr },
-  { X86::VINSERTF64x2Zrm,   X86::VINSERTF64x2Zrm,   X86::VINSERTI64x2Zrm },
-  { X86::VINSERTF64x4Zrr,   X86::VINSERTF64x4Zrr,   X86::VINSERTI64x4Zrr },
-  { X86::VINSERTF64x4Zrm,   X86::VINSERTF64x4Zrm,   X86::VINSERTI64x4Zrm },
-  { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rr,X86::VINSERTI32x4Z256rr },
-  { X86::VINSERTF32x4Z256rm,X86::VINSERTF32x4Z256rm,X86::VINSERTI32x4Z256rm },
-  { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rr,X86::VINSERTI64x2Z256rr },
-  { X86::VINSERTF64x2Z256rm,X86::VINSERTF64x2Z256rm,X86::VINSERTI64x2Z256rm },
-  { X86::VEXTRACTF32x4Zrr,   X86::VEXTRACTF32x4Zrr,   X86::VEXTRACTI32x4Zrr },
-  { X86::VEXTRACTF32x4Zmr,   X86::VEXTRACTF32x4Zmr,   X86::VEXTRACTI32x4Zmr },
-  { X86::VEXTRACTF32x8Zrr,   X86::VEXTRACTF32x8Zrr,   X86::VEXTRACTI32x8Zrr },
-  { X86::VEXTRACTF32x8Zmr,   X86::VEXTRACTF32x8Zmr,   X86::VEXTRACTI32x8Zmr },
-  { X86::VEXTRACTF64x2Zrr,   X86::VEXTRACTF64x2Zrr,   X86::VEXTRACTI64x2Zrr },
-  { X86::VEXTRACTF64x2Zmr,   X86::VEXTRACTF64x2Zmr,   X86::VEXTRACTI64x2Zmr },
-  { X86::VEXTRACTF64x4Zrr,   X86::VEXTRACTF64x4Zrr,   X86::VEXTRACTI64x4Zrr },
-  { X86::VEXTRACTF64x4Zmr,   X86::VEXTRACTF64x4Zmr,   X86::VEXTRACTI64x4Zmr },
-  { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTI32x4Z256rr },
-  { X86::VEXTRACTF32x4Z256mr,X86::VEXTRACTF32x4Z256mr,X86::VEXTRACTI32x4Z256mr },
-  { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTI64x2Z256rr },
-  { X86::VEXTRACTF64x2Z256mr,X86::VEXTRACTF64x2Z256mr,X86::VEXTRACTI64x2Z256mr },
-  { X86::VPERMILPSmi,        X86::VPERMILPSmi,        X86::VPSHUFDmi },
-  { X86::VPERMILPSri,        X86::VPERMILPSri,        X86::VPSHUFDri },
-  { X86::VPERMILPSZ128mi,    X86::VPERMILPSZ128mi,    X86::VPSHUFDZ128mi },
-  { X86::VPERMILPSZ128ri,    X86::VPERMILPSZ128ri,    X86::VPSHUFDZ128ri },
-  { X86::VPERMILPSZ256mi,    X86::VPERMILPSZ256mi,    X86::VPSHUFDZ256mi },
-  { X86::VPERMILPSZ256ri,    X86::VPERMILPSZ256ri,    X86::VPSHUFDZ256ri },
-  { X86::VPERMILPSZmi,       X86::VPERMILPSZmi,       X86::VPSHUFDZmi },
-  { X86::VPERMILPSZri,       X86::VPERMILPSZri,       X86::VPSHUFDZri },
-  { X86::VPERMPSZ256rm,      X86::VPERMPSZ256rm,      X86::VPERMDZ256rm },
-  { X86::VPERMPSZ256rr,      X86::VPERMPSZ256rr,      X86::VPERMDZ256rr },
-  { X86::VPERMPDZ256mi,      X86::VPERMPDZ256mi,      X86::VPERMQZ256mi },
-  { X86::VPERMPDZ256ri,      X86::VPERMPDZ256ri,      X86::VPERMQZ256ri },
-  { X86::VPERMPDZ256rm,      X86::VPERMPDZ256rm,      X86::VPERMQZ256rm },
-  { X86::VPERMPDZ256rr,      X86::VPERMPDZ256rr,      X86::VPERMQZ256rr },
-  { X86::VPERMPSZrm,         X86::VPERMPSZrm,         X86::VPERMDZrm },
-  { X86::VPERMPSZrr,         X86::VPERMPSZrr,         X86::VPERMDZrr },
-  { X86::VPERMPDZmi,         X86::VPERMPDZmi,         X86::VPERMQZmi },
-  { X86::VPERMPDZri,         X86::VPERMPDZri,         X86::VPERMQZri },
-  { X86::VPERMPDZrm,         X86::VPERMPDZrm,         X86::VPERMQZrm },
-  { X86::VPERMPDZrr,         X86::VPERMPDZrr,         X86::VPERMQZrr },
-  { X86::VUNPCKLPDZ256rm,    X86::VUNPCKLPDZ256rm,    X86::VPUNPCKLQDQZ256rm },
-  { X86::VUNPCKLPDZ256rr,    X86::VUNPCKLPDZ256rr,    X86::VPUNPCKLQDQZ256rr },
-  { X86::VUNPCKHPDZ256rm,    X86::VUNPCKHPDZ256rm,    X86::VPUNPCKHQDQZ256rm },
-  { X86::VUNPCKHPDZ256rr,    X86::VUNPCKHPDZ256rr,    X86::VPUNPCKHQDQZ256rr },
-  { X86::VUNPCKLPSZ256rm,    X86::VUNPCKLPSZ256rm,    X86::VPUNPCKLDQZ256rm },
-  { X86::VUNPCKLPSZ256rr,    X86::VUNPCKLPSZ256rr,    X86::VPUNPCKLDQZ256rr },
-  { X86::VUNPCKHPSZ256rm,    X86::VUNPCKHPSZ256rm,    X86::VPUNPCKHDQZ256rm },
-  { X86::VUNPCKHPSZ256rr,    X86::VUNPCKHPSZ256rr,    X86::VPUNPCKHDQZ256rr },
-  { X86::VUNPCKLPDZ128rm,    X86::VUNPCKLPDZ128rm,    X86::VPUNPCKLQDQZ128rm },
-  { X86::VMOVLHPSZrr,        X86::VUNPCKLPDZ128rr,    X86::VPUNPCKLQDQZ128rr },
-  { X86::VUNPCKHPDZ128rm,    X86::VUNPCKHPDZ128rm,    X86::VPUNPCKHQDQZ128rm },
-  { X86::VUNPCKHPDZ128rr,    X86::VUNPCKHPDZ128rr,    X86::VPUNPCKHQDQZ128rr },
-  { X86::VUNPCKLPSZ128rm,    X86::VUNPCKLPSZ128rm,    X86::VPUNPCKLDQZ128rm },
-  { X86::VUNPCKLPSZ128rr,    X86::VUNPCKLPSZ128rr,    X86::VPUNPCKLDQZ128rr },
-  { X86::VUNPCKHPSZ128rm,    X86::VUNPCKHPSZ128rm,    X86::VPUNPCKHDQZ128rm },
-  { X86::VUNPCKHPSZ128rr,    X86::VUNPCKHPSZ128rr,    X86::VPUNPCKHDQZ128rr },
-  { X86::VUNPCKLPDZrm,       X86::VUNPCKLPDZrm,       X86::VPUNPCKLQDQZrm },
-  { X86::VUNPCKLPDZrr,       X86::VUNPCKLPDZrr,       X86::VPUNPCKLQDQZrr },
-  { X86::VUNPCKHPDZrm,       X86::VUNPCKHPDZrm,       X86::VPUNPCKHQDQZrm },
-  { X86::VUNPCKHPDZrr,       X86::VUNPCKHPDZrr,       X86::VPUNPCKHQDQZrr },
-  { X86::VUNPCKLPSZrm,       X86::VUNPCKLPSZrm,       X86::VPUNPCKLDQZrm },
-  { X86::VUNPCKLPSZrr,       X86::VUNPCKLPSZrr,       X86::VPUNPCKLDQZrr },
-  { X86::VUNPCKHPSZrm,       X86::VUNPCKHPSZrm,       X86::VPUNPCKHDQZrm },
-  { X86::VUNPCKHPSZrr,       X86::VUNPCKHPSZrr,       X86::VPUNPCKHDQZrr },
-  { X86::VEXTRACTPSZmr,      X86::VEXTRACTPSZmr,      X86::VPEXTRDZmr },
-  { X86::VEXTRACTPSZrr,      X86::VEXTRACTPSZrr,      X86::VPEXTRDZrr },
-};
-
-static const uint16_t ReplaceableInstrsAVX2[][3] = {
-  //PackedSingle       PackedDouble       PackedInt
-  { X86::VANDNPSYrm,   X86::VANDNPDYrm,   X86::VPANDNYrm   },
-  { X86::VANDNPSYrr,   X86::VANDNPDYrr,   X86::VPANDNYrr   },
-  { X86::VANDPSYrm,    X86::VANDPDYrm,    X86::VPANDYrm    },
-  { X86::VANDPSYrr,    X86::VANDPDYrr,    X86::VPANDYrr    },
-  { X86::VORPSYrm,     X86::VORPDYrm,     X86::VPORYrm     },
-  { X86::VORPSYrr,     X86::VORPDYrr,     X86::VPORYrr     },
-  { X86::VXORPSYrm,    X86::VXORPDYrm,    X86::VPXORYrm    },
-  { X86::VXORPSYrr,    X86::VXORPDYrr,    X86::VPXORYrr    },
-  { X86::VPERM2F128rm,   X86::VPERM2F128rm,   X86::VPERM2I128rm },
-  { X86::VPERM2F128rr,   X86::VPERM2F128rr,   X86::VPERM2I128rr },
-  { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
-  { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
-  { X86::VMOVDDUPrm,     X86::VMOVDDUPrm,     X86::VPBROADCASTQrm},
-  { X86::VMOVDDUPrr,     X86::VMOVDDUPrr,     X86::VPBROADCASTQrr},
-  { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
-  { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
-  { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
-  { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
-  { X86::VBROADCASTF128,  X86::VBROADCASTF128,  X86::VBROADCASTI128 },
-  { X86::VBLENDPSYrri,    X86::VBLENDPSYrri,    X86::VPBLENDDYrri },
-  { X86::VBLENDPSYrmi,    X86::VBLENDPSYrmi,    X86::VPBLENDDYrmi },
-  { X86::VPERMILPSYmi,    X86::VPERMILPSYmi,    X86::VPSHUFDYmi },
-  { X86::VPERMILPSYri,    X86::VPERMILPSYri,    X86::VPSHUFDYri },
-  { X86::VUNPCKLPDYrm,    X86::VUNPCKLPDYrm,    X86::VPUNPCKLQDQYrm },
-  { X86::VUNPCKLPDYrr,    X86::VUNPCKLPDYrr,    X86::VPUNPCKLQDQYrr },
-  { X86::VUNPCKHPDYrm,    X86::VUNPCKHPDYrm,    X86::VPUNPCKHQDQYrm },
-  { X86::VUNPCKHPDYrr,    X86::VUNPCKHPDYrr,    X86::VPUNPCKHQDQYrr },
-  { X86::VUNPCKLPSYrm,    X86::VUNPCKLPSYrm,    X86::VPUNPCKLDQYrm },
-  { X86::VUNPCKLPSYrr,    X86::VUNPCKLPSYrr,    X86::VPUNPCKLDQYrr },
-  { X86::VUNPCKHPSYrm,    X86::VUNPCKHPSYrm,    X86::VPUNPCKHDQYrm },
-  { X86::VUNPCKHPSYrr,    X86::VUNPCKHPSYrr,    X86::VPUNPCKHDQYrr },
-};
-
-static const uint16_t ReplaceableInstrsFP[][3] = {
-  //PackedSingle         PackedDouble
-  { X86::MOVLPSrm,       X86::MOVLPDrm,      X86::INSTRUCTION_LIST_END },
-  { X86::MOVHPSrm,       X86::MOVHPDrm,      X86::INSTRUCTION_LIST_END },
-  { X86::MOVHPSmr,       X86::MOVHPDmr,      X86::INSTRUCTION_LIST_END },
-  { X86::VMOVLPSrm,      X86::VMOVLPDrm,     X86::INSTRUCTION_LIST_END },
-  { X86::VMOVHPSrm,      X86::VMOVHPDrm,     X86::INSTRUCTION_LIST_END },
-  { X86::VMOVHPSmr,      X86::VMOVHPDmr,     X86::INSTRUCTION_LIST_END },
-  { X86::VMOVLPSZ128rm,  X86::VMOVLPDZ128rm, X86::INSTRUCTION_LIST_END },
-  { X86::VMOVHPSZ128rm,  X86::VMOVHPDZ128rm, X86::INSTRUCTION_LIST_END },
-  { X86::VMOVHPSZ128mr,  X86::VMOVHPDZ128mr, X86::INSTRUCTION_LIST_END },
-};
-
-static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = {
-  //PackedSingle       PackedDouble       PackedInt
-  { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
-  { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
-  { X86::VINSERTF128rm,  X86::VINSERTF128rm,  X86::VINSERTI128rm },
-  { X86::VINSERTF128rr,  X86::VINSERTF128rr,  X86::VINSERTI128rr },
-};
-
-static const uint16_t ReplaceableInstrsAVX512[][4] = {
-  // Two integer columns for 64-bit and 32-bit elements.
-  //PackedSingle        PackedDouble        PackedInt             PackedInt
-  { X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr, X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr  },
-  { X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm, X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm  },
-  { X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr, X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr  },
-  { X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr, X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr  },
-  { X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm, X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm  },
-  { X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr, X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr  },
-  { X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm, X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm  },
-  { X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr, X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr  },
-  { X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr, X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr  },
-  { X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm, X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm  },
-  { X86::VMOVAPSZmr,    X86::VMOVAPDZmr,    X86::VMOVDQA64Zmr,    X86::VMOVDQA32Zmr     },
-  { X86::VMOVAPSZrm,    X86::VMOVAPDZrm,    X86::VMOVDQA64Zrm,    X86::VMOVDQA32Zrm     },
-  { X86::VMOVAPSZrr,    X86::VMOVAPDZrr,    X86::VMOVDQA64Zrr,    X86::VMOVDQA32Zrr     },
-  { X86::VMOVUPSZmr,    X86::VMOVUPDZmr,    X86::VMOVDQU64Zmr,    X86::VMOVDQU32Zmr     },
-  { X86::VMOVUPSZrm,    X86::VMOVUPDZrm,    X86::VMOVDQU64Zrm,    X86::VMOVDQU32Zrm     },
-};
-
-static const uint16_t ReplaceableInstrsAVX512DQ[][4] = {
-  // Two integer columns for 64-bit and 32-bit elements.
-  //PackedSingle        PackedDouble        PackedInt           PackedInt
-  { X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
-  { X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
-  { X86::VANDPSZ128rm,  X86::VANDPDZ128rm,  X86::VPANDQZ128rm,  X86::VPANDDZ128rm  },
-  { X86::VANDPSZ128rr,  X86::VANDPDZ128rr,  X86::VPANDQZ128rr,  X86::VPANDDZ128rr  },
-  { X86::VORPSZ128rm,   X86::VORPDZ128rm,   X86::VPORQZ128rm,   X86::VPORDZ128rm   },
-  { X86::VORPSZ128rr,   X86::VORPDZ128rr,   X86::VPORQZ128rr,   X86::VPORDZ128rr   },
-  { X86::VXORPSZ128rm,  X86::VXORPDZ128rm,  X86::VPXORQZ128rm,  X86::VPXORDZ128rm  },
-  { X86::VXORPSZ128rr,  X86::VXORPDZ128rr,  X86::VPXORQZ128rr,  X86::VPXORDZ128rr  },
-  { X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
-  { X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
-  { X86::VANDPSZ256rm,  X86::VANDPDZ256rm,  X86::VPANDQZ256rm,  X86::VPANDDZ256rm  },
-  { X86::VANDPSZ256rr,  X86::VANDPDZ256rr,  X86::VPANDQZ256rr,  X86::VPANDDZ256rr  },
-  { X86::VORPSZ256rm,   X86::VORPDZ256rm,   X86::VPORQZ256rm,   X86::VPORDZ256rm   },
-  { X86::VORPSZ256rr,   X86::VORPDZ256rr,   X86::VPORQZ256rr,   X86::VPORDZ256rr   },
-  { X86::VXORPSZ256rm,  X86::VXORPDZ256rm,  X86::VPXORQZ256rm,  X86::VPXORDZ256rm  },
-  { X86::VXORPSZ256rr,  X86::VXORPDZ256rr,  X86::VPXORQZ256rr,  X86::VPXORDZ256rr  },
-  { X86::VANDNPSZrm,    X86::VANDNPDZrm,    X86::VPANDNQZrm,    X86::VPANDNDZrm    },
-  { X86::VANDNPSZrr,    X86::VANDNPDZrr,    X86::VPANDNQZrr,    X86::VPANDNDZrr    },
-  { X86::VANDPSZrm,     X86::VANDPDZrm,     X86::VPANDQZrm,     X86::VPANDDZrm     },
-  { X86::VANDPSZrr,     X86::VANDPDZrr,     X86::VPANDQZrr,     X86::VPANDDZrr     },
-  { X86::VORPSZrm,      X86::VORPDZrm,      X86::VPORQZrm,      X86::VPORDZrm      },
-  { X86::VORPSZrr,      X86::VORPDZrr,      X86::VPORQZrr,      X86::VPORDZrr      },
-  { X86::VXORPSZrm,     X86::VXORPDZrm,     X86::VPXORQZrm,     X86::VPXORDZrm     },
-  { X86::VXORPSZrr,     X86::VXORPDZrr,     X86::VPXORQZrr,     X86::VPXORDZrr     },
-};
-
-static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
-  // Two integer columns for 64-bit and 32-bit elements.
-  //PackedSingle          PackedDouble
-  //PackedInt             PackedInt
-  { X86::VANDNPSZ128rmk,  X86::VANDNPDZ128rmk,
-    X86::VPANDNQZ128rmk,  X86::VPANDNDZ128rmk  },
-  { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz,
-    X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz },
-  { X86::VANDNPSZ128rrk,  X86::VANDNPDZ128rrk,
-    X86::VPANDNQZ128rrk,  X86::VPANDNDZ128rrk  },
-  { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz,
-    X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz },
-  { X86::VANDPSZ128rmk,   X86::VANDPDZ128rmk,
-    X86::VPANDQZ128rmk,   X86::VPANDDZ128rmk   },
-  { X86::VANDPSZ128rmkz,  X86::VANDPDZ128rmkz,
-    X86::VPANDQZ128rmkz,  X86::VPANDDZ128rmkz  },
-  { X86::VANDPSZ128rrk,   X86::VANDPDZ128rrk,
-    X86::VPANDQZ128rrk,   X86::VPANDDZ128rrk   },
-  { X86::VANDPSZ128rrkz,  X86::VANDPDZ128rrkz,
-    X86::VPANDQZ128rrkz,  X86::VPANDDZ128rrkz  },
-  { X86::VORPSZ128rmk,    X86::VORPDZ128rmk,
-    X86::VPORQZ128rmk,    X86::VPORDZ128rmk    },
-  { X86::VORPSZ128rmkz,   X86::VORPDZ128rmkz,
-    X86::VPORQZ128rmkz,   X86::VPORDZ128rmkz   },
-  { X86::VORPSZ128rrk,    X86::VORPDZ128rrk,
-    X86::VPORQZ128rrk,    X86::VPORDZ128rrk    },
-  { X86::VORPSZ128rrkz,   X86::VORPDZ128rrkz,
-    X86::VPORQZ128rrkz,   X86::VPORDZ128rrkz   },
-  { X86::VXORPSZ128rmk,   X86::VXORPDZ128rmk,
-    X86::VPXORQZ128rmk,   X86::VPXORDZ128rmk   },
-  { X86::VXORPSZ128rmkz,  X86::VXORPDZ128rmkz,
-    X86::VPXORQZ128rmkz,  X86::VPXORDZ128rmkz  },
-  { X86::VXORPSZ128rrk,   X86::VXORPDZ128rrk,
-    X86::VPXORQZ128rrk,   X86::VPXORDZ128rrk   },
-  { X86::VXORPSZ128rrkz,  X86::VXORPDZ128rrkz,
-    X86::VPXORQZ128rrkz,  X86::VPXORDZ128rrkz  },
-  { X86::VANDNPSZ256rmk,  X86::VANDNPDZ256rmk,
-    X86::VPANDNQZ256rmk,  X86::VPANDNDZ256rmk  },
-  { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz,
-    X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz },
-  { X86::VANDNPSZ256rrk,  X86::VANDNPDZ256rrk,
-    X86::VPANDNQZ256rrk,  X86::VPANDNDZ256rrk  },
-  { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz,
-    X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz },
-  { X86::VANDPSZ256rmk,   X86::VANDPDZ256rmk,
-    X86::VPANDQZ256rmk,   X86::VPANDDZ256rmk   },
-  { X86::VANDPSZ256rmkz,  X86::VANDPDZ256rmkz,
-    X86::VPANDQZ256rmkz,  X86::VPANDDZ256rmkz  },
-  { X86::VANDPSZ256rrk,   X86::VANDPDZ256rrk,
-    X86::VPANDQZ256rrk,   X86::VPANDDZ256rrk   },
-  { X86::VANDPSZ256rrkz,  X86::VANDPDZ256rrkz,
-    X86::VPANDQZ256rrkz,  X86::VPANDDZ256rrkz  },
-  { X86::VORPSZ256rmk,    X86::VORPDZ256rmk,
-    X86::VPORQZ256rmk,    X86::VPORDZ256rmk    },
-  { X86::VORPSZ256rmkz,   X86::VORPDZ256rmkz,
-    X86::VPORQZ256rmkz,   X86::VPORDZ256rmkz   },
-  { X86::VORPSZ256rrk,    X86::VORPDZ256rrk,
-    X86::VPORQZ256rrk,    X86::VPORDZ256rrk    },
-  { X86::VORPSZ256rrkz,   X86::VORPDZ256rrkz,
-    X86::VPORQZ256rrkz,   X86::VPORDZ256rrkz   },
-  { X86::VXORPSZ256rmk,   X86::VXORPDZ256rmk,
-    X86::VPXORQZ256rmk,   X86::VPXORDZ256rmk   },
-  { X86::VXORPSZ256rmkz,  X86::VXORPDZ256rmkz,
-    X86::VPXORQZ256rmkz,  X86::VPXORDZ256rmkz  },
-  { X86::VXORPSZ256rrk,   X86::VXORPDZ256rrk,
-    X86::VPXORQZ256rrk,   X86::VPXORDZ256rrk   },
-  { X86::VXORPSZ256rrkz,  X86::VXORPDZ256rrkz,
-    X86::VPXORQZ256rrkz,  X86::VPXORDZ256rrkz  },
-  { X86::VANDNPSZrmk,     X86::VANDNPDZrmk,
-    X86::VPANDNQZrmk,     X86::VPANDNDZrmk     },
-  { X86::VANDNPSZrmkz,    X86::VANDNPDZrmkz,
-    X86::VPANDNQZrmkz,    X86::VPANDNDZrmkz    },
-  { X86::VANDNPSZrrk,     X86::VANDNPDZrrk,
-    X86::VPANDNQZrrk,     X86::VPANDNDZrrk     },
-  { X86::VANDNPSZrrkz,    X86::VANDNPDZrrkz,
-    X86::VPANDNQZrrkz,    X86::VPANDNDZrrkz    },
-  { X86::VANDPSZrmk,      X86::VANDPDZrmk,
-    X86::VPANDQZrmk,      X86::VPANDDZrmk      },
-  { X86::VANDPSZrmkz,     X86::VANDPDZrmkz,
-    X86::VPANDQZrmkz,     X86::VPANDDZrmkz     },
-  { X86::VANDPSZrrk,      X86::VANDPDZrrk,
-    X86::VPANDQZrrk,      X86::VPANDDZrrk      },
-  { X86::VANDPSZrrkz,     X86::VANDPDZrrkz,
-    X86::VPANDQZrrkz,     X86::VPANDDZrrkz     },
-  { X86::VORPSZrmk,       X86::VORPDZrmk,
-    X86::VPORQZrmk,       X86::VPORDZrmk       },
-  { X86::VORPSZrmkz,      X86::VORPDZrmkz,
-    X86::VPORQZrmkz,      X86::VPORDZrmkz      },
-  { X86::VORPSZrrk,       X86::VORPDZrrk,
-    X86::VPORQZrrk,       X86::VPORDZrrk       },
-  { X86::VORPSZrrkz,      X86::VORPDZrrkz,
-    X86::VPORQZrrkz,      X86::VPORDZrrkz      },
-  { X86::VXORPSZrmk,      X86::VXORPDZrmk,
-    X86::VPXORQZrmk,      X86::VPXORDZrmk      },
-  { X86::VXORPSZrmkz,     X86::VXORPDZrmkz,
-    X86::VPXORQZrmkz,     X86::VPXORDZrmkz     },
-  { X86::VXORPSZrrk,      X86::VXORPDZrrk,
-    X86::VPXORQZrrk,      X86::VPXORDZrrk      },
-  { X86::VXORPSZrrkz,     X86::VXORPDZrrkz,
-    X86::VPXORQZrrkz,     X86::VPXORDZrrkz     },
-  // Broadcast loads can be handled the same as masked operations to avoid
-  // changing element size.
-  { X86::VANDNPSZ128rmb,  X86::VANDNPDZ128rmb,
-    X86::VPANDNQZ128rmb,  X86::VPANDNDZ128rmb  },
-  { X86::VANDPSZ128rmb,   X86::VANDPDZ128rmb,
-    X86::VPANDQZ128rmb,   X86::VPANDDZ128rmb   },
-  { X86::VORPSZ128rmb,    X86::VORPDZ128rmb,
-    X86::VPORQZ128rmb,    X86::VPORDZ128rmb    },
-  { X86::VXORPSZ128rmb,   X86::VXORPDZ128rmb,
-    X86::VPXORQZ128rmb,   X86::VPXORDZ128rmb   },
-  { X86::VANDNPSZ256rmb,  X86::VANDNPDZ256rmb,
-    X86::VPANDNQZ256rmb,  X86::VPANDNDZ256rmb  },
-  { X86::VANDPSZ256rmb,   X86::VANDPDZ256rmb,
-    X86::VPANDQZ256rmb,   X86::VPANDDZ256rmb   },
-  { X86::VORPSZ256rmb,    X86::VORPDZ256rmb,
-    X86::VPORQZ256rmb,    X86::VPORDZ256rmb    },
-  { X86::VXORPSZ256rmb,   X86::VXORPDZ256rmb,
-    X86::VPXORQZ256rmb,   X86::VPXORDZ256rmb   },
-  { X86::VANDNPSZrmb,     X86::VANDNPDZrmb,
-    X86::VPANDNQZrmb,     X86::VPANDNDZrmb     },
-  { X86::VANDPSZrmb,      X86::VANDPDZrmb,
-    X86::VPANDQZrmb,      X86::VPANDDZrmb      },
-  { X86::VANDPSZrmb,      X86::VANDPDZrmb,
-    X86::VPANDQZrmb,      X86::VPANDDZrmb      },
-  { X86::VORPSZrmb,       X86::VORPDZrmb,
-    X86::VPORQZrmb,       X86::VPORDZrmb       },
-  { X86::VXORPSZrmb,      X86::VXORPDZrmb,
-    X86::VPXORQZrmb,      X86::VPXORDZrmb      },
-  { X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk,
-    X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk },
-  { X86::VANDPSZ128rmbk,  X86::VANDPDZ128rmbk,
-    X86::VPANDQZ128rmbk,  X86::VPANDDZ128rmbk  },
-  { X86::VORPSZ128rmbk,   X86::VORPDZ128rmbk,
-    X86::VPORQZ128rmbk,   X86::VPORDZ128rmbk   },
-  { X86::VXORPSZ128rmbk,  X86::VXORPDZ128rmbk,
-    X86::VPXORQZ128rmbk,  X86::VPXORDZ128rmbk  },
-  { X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk,
-    X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk },
-  { X86::VANDPSZ256rmbk,  X86::VANDPDZ256rmbk,
-    X86::VPANDQZ256rmbk,  X86::VPANDDZ256rmbk  },
-  { X86::VORPSZ256rmbk,   X86::VORPDZ256rmbk,
-    X86::VPORQZ256rmbk,   X86::VPORDZ256rmbk   },
-  { X86::VXORPSZ256rmbk,  X86::VXORPDZ256rmbk,
-    X86::VPXORQZ256rmbk,  X86::VPXORDZ256rmbk  },
-  { X86::VANDNPSZrmbk,    X86::VANDNPDZrmbk,
-    X86::VPANDNQZrmbk,    X86::VPANDNDZrmbk    },
-  { X86::VANDPSZrmbk,     X86::VANDPDZrmbk,
-    X86::VPANDQZrmbk,     X86::VPANDDZrmbk     },
-  { X86::VANDPSZrmbk,     X86::VANDPDZrmbk,
-    X86::VPANDQZrmbk,     X86::VPANDDZrmbk     },
-  { X86::VORPSZrmbk,      X86::VORPDZrmbk,
-    X86::VPORQZrmbk,      X86::VPORDZrmbk      },
-  { X86::VXORPSZrmbk,     X86::VXORPDZrmbk,
-    X86::VPXORQZrmbk,     X86::VPXORDZrmbk     },
-  { X86::VANDNPSZ128rmbkz,X86::VANDNPDZ128rmbkz,
-    X86::VPANDNQZ128rmbkz,X86::VPANDNDZ128rmbkz},
-  { X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz,
-    X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz },
-  { X86::VORPSZ128rmbkz,  X86::VORPDZ128rmbkz,
-    X86::VPORQZ128rmbkz,  X86::VPORDZ128rmbkz  },
-  { X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz,
-    X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz },
-  { X86::VANDNPSZ256rmbkz,X86::VANDNPDZ256rmbkz,
-    X86::VPANDNQZ256rmbkz,X86::VPANDNDZ256rmbkz},
-  { X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz,
-    X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz },
-  { X86::VORPSZ256rmbkz,  X86::VORPDZ256rmbkz,
-    X86::VPORQZ256rmbkz,  X86::VPORDZ256rmbkz  },
-  { X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz,
-    X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz },
-  { X86::VANDNPSZrmbkz,   X86::VANDNPDZrmbkz,
-    X86::VPANDNQZrmbkz,   X86::VPANDNDZrmbkz   },
-  { X86::VANDPSZrmbkz,    X86::VANDPDZrmbkz,
-    X86::VPANDQZrmbkz,    X86::VPANDDZrmbkz    },
-  { X86::VANDPSZrmbkz,    X86::VANDPDZrmbkz,
-    X86::VPANDQZrmbkz,    X86::VPANDDZrmbkz    },
-  { X86::VORPSZrmbkz,     X86::VORPDZrmbkz,
-    X86::VPORQZrmbkz,     X86::VPORDZrmbkz     },
-  { X86::VXORPSZrmbkz,    X86::VXORPDZrmbkz,
-    X86::VPXORQZrmbkz,    X86::VPXORDZrmbkz    },
-};
-
-// NOTE: These should only be used by the custom domain methods.
-static const uint16_t ReplaceableBlendInstrs[][3] = {
-  //PackedSingle             PackedDouble             PackedInt
-  { X86::BLENDPSrmi,         X86::BLENDPDrmi,         X86::PBLENDWrmi   },
-  { X86::BLENDPSrri,         X86::BLENDPDrri,         X86::PBLENDWrri   },
-  { X86::VBLENDPSrmi,        X86::VBLENDPDrmi,        X86::VPBLENDWrmi  },
-  { X86::VBLENDPSrri,        X86::VBLENDPDrri,        X86::VPBLENDWrri  },
-  { X86::VBLENDPSYrmi,       X86::VBLENDPDYrmi,       X86::VPBLENDWYrmi },
-  { X86::VBLENDPSYrri,       X86::VBLENDPDYrri,       X86::VPBLENDWYrri },
-};
-static const uint16_t ReplaceableBlendAVX2Instrs[][3] = {
-  //PackedSingle             PackedDouble             PackedInt
-  { X86::VBLENDPSrmi,        X86::VBLENDPDrmi,        X86::VPBLENDDrmi  },
-  { X86::VBLENDPSrri,        X86::VBLENDPDrri,        X86::VPBLENDDrri  },
-  { X86::VBLENDPSYrmi,       X86::VBLENDPDYrmi,       X86::VPBLENDDYrmi },
-  { X86::VBLENDPSYrri,       X86::VBLENDPDYrri,       X86::VPBLENDDYrri },
-};
-
-// Special table for changing EVEX logic instructions to VEX.
-// TODO: Should we run EVEX->VEX earlier?
-static const uint16_t ReplaceableCustomAVX512LogicInstrs[][4] = {
-  // Two integer columns for 64-bit and 32-bit elements.
-  //PackedSingle     PackedDouble     PackedInt           PackedInt
-  { X86::VANDNPSrm,  X86::VANDNPDrm,  X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
-  { X86::VANDNPSrr,  X86::VANDNPDrr,  X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
-  { X86::VANDPSrm,   X86::VANDPDrm,   X86::VPANDQZ128rm,  X86::VPANDDZ128rm  },
-  { X86::VANDPSrr,   X86::VANDPDrr,   X86::VPANDQZ128rr,  X86::VPANDDZ128rr  },
-  { X86::VORPSrm,    X86::VORPDrm,    X86::VPORQZ128rm,   X86::VPORDZ128rm   },
-  { X86::VORPSrr,    X86::VORPDrr,    X86::VPORQZ128rr,   X86::VPORDZ128rr   },
-  { X86::VXORPSrm,   X86::VXORPDrm,   X86::VPXORQZ128rm,  X86::VPXORDZ128rm  },
-  { X86::VXORPSrr,   X86::VXORPDrr,   X86::VPXORQZ128rr,  X86::VPXORDZ128rr  },
-  { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
-  { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
-  { X86::VANDPSYrm,  X86::VANDPDYrm,  X86::VPANDQZ256rm,  X86::VPANDDZ256rm  },
-  { X86::VANDPSYrr,  X86::VANDPDYrr,  X86::VPANDQZ256rr,  X86::VPANDDZ256rr  },
-  { X86::VORPSYrm,   X86::VORPDYrm,   X86::VPORQZ256rm,   X86::VPORDZ256rm   },
-  { X86::VORPSYrr,   X86::VORPDYrr,   X86::VPORQZ256rr,   X86::VPORDZ256rr   },
-  { X86::VXORPSYrm,  X86::VXORPDYrm,  X86::VPXORQZ256rm,  X86::VPXORDZ256rm  },
-  { X86::VXORPSYrr,  X86::VXORPDYrr,  X86::VPXORQZ256rr,  X86::VPXORDZ256rr  },
-};
-
 // FIXME: Some shuffle and unpack instructions have equivalents in different
 // domains, but they require a bit more work than just switching opcodes.
 
 static const uint16_t *lookup(unsigned opcode, unsigned domain,
                               ArrayRef<uint16_t[3]> Table) {
-  for (const uint16_t (&Row)[3] : Table)
-    if (Row[domain-1] == opcode)
+  for (const uint16_t(&Row)[3] : Table)
+    if (Row[domain - 1] == opcode)
       return Row;
   return nullptr;
 }
@@ -8392,8 +8740,8 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain,
 static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
                                     ArrayRef<uint16_t[4]> Table) {
   // If this is the integer domain make sure to check both integer columns.
-  for (const uint16_t (&Row)[4] : Table)
-    if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode))
+  for (const uint16_t(&Row)[4] : Table)
+    if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
       return Row;
   return nullptr;
 }
@@ -8477,22 +8825,38 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
   case X86::VPBLENDWYrmi:
   case X86::VPBLENDWYrri:
     return GetBlendDomains(8, false);
-  case X86::VPANDDZ128rr:  case X86::VPANDDZ128rm:
-  case X86::VPANDDZ256rr:  case X86::VPANDDZ256rm:
-  case X86::VPANDQZ128rr:  case X86::VPANDQZ128rm:
-  case X86::VPANDQZ256rr:  case X86::VPANDQZ256rm:
-  case X86::VPANDNDZ128rr: case X86::VPANDNDZ128rm:
-  case X86::VPANDNDZ256rr: case X86::VPANDNDZ256rm:
-  case X86::VPANDNQZ128rr: case X86::VPANDNQZ128rm:
-  case X86::VPANDNQZ256rr: case X86::VPANDNQZ256rm:
-  case X86::VPORDZ128rr:   case X86::VPORDZ128rm:
-  case X86::VPORDZ256rr:   case X86::VPORDZ256rm:
-  case X86::VPORQZ128rr:   case X86::VPORQZ128rm:
-  case X86::VPORQZ256rr:   case X86::VPORQZ256rm:
-  case X86::VPXORDZ128rr:  case X86::VPXORDZ128rm:
-  case X86::VPXORDZ256rr:  case X86::VPXORDZ256rm:
-  case X86::VPXORQZ128rr:  case X86::VPXORQZ128rm:
-  case X86::VPXORQZ256rr:  case X86::VPXORQZ256rm:
+  case X86::VPANDDZ128rr:
+  case X86::VPANDDZ128rm:
+  case X86::VPANDDZ256rr:
+  case X86::VPANDDZ256rm:
+  case X86::VPANDQZ128rr:
+  case X86::VPANDQZ128rm:
+  case X86::VPANDQZ256rr:
+  case X86::VPANDQZ256rm:
+  case X86::VPANDNDZ128rr:
+  case X86::VPANDNDZ128rm:
+  case X86::VPANDNDZ256rr:
+  case X86::VPANDNDZ256rm:
+  case X86::VPANDNQZ128rr:
+  case X86::VPANDNQZ128rm:
+  case X86::VPANDNQZ256rr:
+  case X86::VPANDNQZ256rm:
+  case X86::VPORDZ128rr:
+  case X86::VPORDZ128rm:
+  case X86::VPORDZ256rr:
+  case X86::VPORDZ256rm:
+  case X86::VPORQZ128rr:
+  case X86::VPORQZ128rm:
+  case X86::VPORQZ256rr:
+  case X86::VPORQZ256rm:
+  case X86::VPXORDZ128rr:
+  case X86::VPXORDZ128rm:
+  case X86::VPXORDZ256rr:
+  case X86::VPXORDZ256rm:
+  case X86::VPXORQZ128rr:
+  case X86::VPXORQZ128rm:
+  case X86::VPXORQZ256rr:
+  case X86::VPXORQZ256rm:
     // If we don't have DQI see if we can still switch from an EVEX integer
     // instruction to a VEX floating point instruction.
     if (Subtarget.hasDQI())
@@ -8518,8 +8882,7 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
     // both inputs.
     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
         MI.getOperand(0).getSubReg() == 0 &&
-        MI.getOperand(1).getSubReg() == 0 &&
-        MI.getOperand(2).getSubReg() == 0)
+        MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0)
       return 0x6;
     return 0;
   case X86::SHUFPDrri:
@@ -8528,6 +8891,8 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
   return 0;
 }
 
+#include "X86ReplaceableInstrs.def"
+
 bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
                                             unsigned Domain) const {
   assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
@@ -8600,28 +8965,44 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
   case X86::VPBLENDWYrmi:
   case X86::VPBLENDWYrri:
     return SetBlendDomain(16, true);
-  case X86::VPANDDZ128rr:  case X86::VPANDDZ128rm:
-  case X86::VPANDDZ256rr:  case X86::VPANDDZ256rm:
-  case X86::VPANDQZ128rr:  case X86::VPANDQZ128rm:
-  case X86::VPANDQZ256rr:  case X86::VPANDQZ256rm:
-  case X86::VPANDNDZ128rr: case X86::VPANDNDZ128rm:
-  case X86::VPANDNDZ256rr: case X86::VPANDNDZ256rm:
-  case X86::VPANDNQZ128rr: case X86::VPANDNQZ128rm:
-  case X86::VPANDNQZ256rr: case X86::VPANDNQZ256rm:
-  case X86::VPORDZ128rr:   case X86::VPORDZ128rm:
-  case X86::VPORDZ256rr:   case X86::VPORDZ256rm:
-  case X86::VPORQZ128rr:   case X86::VPORQZ128rm:
-  case X86::VPORQZ256rr:   case X86::VPORQZ256rm:
-  case X86::VPXORDZ128rr:  case X86::VPXORDZ128rm:
-  case X86::VPXORDZ256rr:  case X86::VPXORDZ256rm:
-  case X86::VPXORQZ128rr:  case X86::VPXORQZ128rm:
-  case X86::VPXORQZ256rr:  case X86::VPXORQZ256rm: {
+  case X86::VPANDDZ128rr:
+  case X86::VPANDDZ128rm:
+  case X86::VPANDDZ256rr:
+  case X86::VPANDDZ256rm:
+  case X86::VPANDQZ128rr:
+  case X86::VPANDQZ128rm:
+  case X86::VPANDQZ256rr:
+  case X86::VPANDQZ256rm:
+  case X86::VPANDNDZ128rr:
+  case X86::VPANDNDZ128rm:
+  case X86::VPANDNDZ256rr:
+  case X86::VPANDNDZ256rm:
+  case X86::VPANDNQZ128rr:
+  case X86::VPANDNQZ128rm:
+  case X86::VPANDNQZ256rr:
+  case X86::VPANDNQZ256rm:
+  case X86::VPORDZ128rr:
+  case X86::VPORDZ128rm:
+  case X86::VPORDZ256rr:
+  case X86::VPORDZ256rm:
+  case X86::VPORQZ128rr:
+  case X86::VPORQZ128rm:
+  case X86::VPORQZ256rr:
+  case X86::VPORQZ256rm:
+  case X86::VPXORDZ128rr:
+  case X86::VPXORDZ128rm:
+  case X86::VPXORDZ256rr:
+  case X86::VPXORDZ256rm:
+  case X86::VPXORQZ128rr:
+  case X86::VPXORQZ128rm:
+  case X86::VPXORQZ256rr:
+  case X86::VPXORQZ256rm: {
     // Without DQI, convert EVEX instructions to VEX instructions.
     if (Subtarget.hasDQI())
       return false;
 
-    const uint16_t *table = lookupAVX512(MI.getOpcode(), dom,
-                                         ReplaceableCustomAVX512LogicInstrs);
+    const uint16_t *table =
+        lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs);
     assert(table && "Instruction not found in table?");
     // Don't change integer Q instructions to D instructions and
     // use D intructions if we started with a PS instruction.
@@ -8649,8 +9030,10 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
     if (Domain == 1) {
       unsigned Imm = MI.getOperand(3).getImm();
       unsigned NewImm = 0x44;
-      if (Imm & 1) NewImm |= 0x0a;
-      if (Imm & 2) NewImm |= 0xa0;
+      if (Imm & 1)
+        NewImm |= 0x0a;
+      if (Imm & 2)
+        NewImm |= 0xa0;
       MI.getOperand(3).setImm(NewImm);
       MI.setDesc(get(X86::SHUFPSrri));
     }
@@ -8685,12 +9068,12 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
       validDomains = 0xe;
     } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
       validDomains = 0xe;
-    } else if (Subtarget.hasDQI() && lookupAVX512(opcode, domain,
-                                                  ReplaceableInstrsAVX512DQ)) {
+    } else if (Subtarget.hasDQI() &&
+               lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
       validDomains = 0xe;
     } else if (Subtarget.hasDQI()) {
-      if (const uint16_t *table = lookupAVX512(opcode, domain,
-                                             ReplaceableInstrsAVX512DQMasked)) {
+      if (const uint16_t *table =
+              lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) {
         if (domain == 1 || (domain == 3 && table[3] == opcode))
           validDomains = 0xa;
         else
@@ -8702,7 +9085,7 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
 }
 
 void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
-  assert(Domain>0 && Domain<4 && "Invalid execution domain");
+  assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
   uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   assert(dom && "Not an SSE instruction");
 
@@ -8766,7 +9149,8 @@ MCInst X86InstrInfo::getNop() const {
 
 bool X86InstrInfo::isHighLatencyDef(int opc) const {
   switch (opc) {
-  default: return false;
+  default:
+    return false;
   case X86::DIVPDrm:
   case X86::DIVPDrr:
   case X86::DIVPSrm:
@@ -9095,8 +9479,7 @@ bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
   // instructions that depend on the exact status flags (zero, sign, etc.)
   // that are set by using these particular operands with this operation.
   const MachineOperand *FlagDef = Inst.findRegisterDefOperand(X86::EFLAGS);
-  assert((Inst.getNumDefs() == 1 || FlagDef) &&
-         "Implicit def isn't flags?");
+  assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?");
   if (FlagDef && !FlagDef->isDead())
     return false;
 
@@ -9679,230 +10062,228 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
 }
 
 namespace {
-  /// Create Global Base Reg pass. This initializes the PIC
-  /// global base register for x86-32.
-  struct CGBR : public MachineFunctionPass {
-    static char ID;
-    CGBR() : MachineFunctionPass(ID) {}
-
-    bool runOnMachineFunction(MachineFunction &MF) override {
-      const X86TargetMachine *TM =
+/// Create Global Base Reg pass. This initializes the PIC
+/// global base register for x86-32.
+struct CGBR : public MachineFunctionPass {
+  static char ID;
+  CGBR() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    const X86TargetMachine *TM =
         static_cast<const X86TargetMachine *>(&MF.getTarget());
-      const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+    const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
 
-      // Don't do anything in the 64-bit small and kernel code models. They use
-      // RIP-relative addressing for everything.
-      if (STI.is64Bit() && (TM->getCodeModel() == CodeModel::Small ||
-                            TM->getCodeModel() == CodeModel::Kernel))
-        return false;
+    // Don't do anything in the 64-bit small and kernel code models. They use
+    // RIP-relative addressing for everything.
+    if (STI.is64Bit() && (TM->getCodeModel() == CodeModel::Small ||
+                          TM->getCodeModel() == CodeModel::Kernel))
+      return false;
 
-      // Only emit a global base reg in PIC mode.
-      if (!TM->isPositionIndependent())
-        return false;
+    // Only emit a global base reg in PIC mode.
+    if (!TM->isPositionIndependent())
+      return false;
 
-      X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
-      Register GlobalBaseReg = X86FI->getGlobalBaseReg();
+    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    Register GlobalBaseReg = X86FI->getGlobalBaseReg();
 
-      // If we didn't need a GlobalBaseReg, don't insert code.
-      if (GlobalBaseReg == 0)
-        return false;
+    // If we didn't need a GlobalBaseReg, don't insert code.
+    if (GlobalBaseReg == 0)
+      return false;
 
-      // Insert the set of GlobalBaseReg into the first MBB of the function
-      MachineBasicBlock &FirstMBB = MF.front();
-      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
-      DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
-      MachineRegisterInfo &RegInfo = MF.getRegInfo();
-      const X86InstrInfo *TII = STI.getInstrInfo();
+    // Insert the set of GlobalBaseReg into the first MBB of the function
+    MachineBasicBlock &FirstMBB = MF.front();
+    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+    DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+    MachineRegisterInfo &RegInfo = MF.getRegInfo();
+    const X86InstrInfo *TII = STI.getInstrInfo();
 
-      Register PC;
-      if (STI.isPICStyleGOT())
-        PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
-      else
-        PC = GlobalBaseReg;
-
-      if (STI.is64Bit()) {
-        if (TM->getCodeModel() == CodeModel::Medium) {
-          // In the medium code model, use a RIP-relative LEA to materialize the
-          // GOT.
-          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC)
-              .addReg(X86::RIP)
-              .addImm(0)
-              .addReg(0)
-              .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
-              .addReg(0);
-        } else if (TM->getCodeModel() == CodeModel::Large) {
-          // In the large code model, we are aiming for this code, though the
-          // register allocation may vary:
-          //   leaq .LN$pb(%rip), %rax
-          //   movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
-          //   addq %rcx, %rax
-          // RAX now holds address of _GLOBAL_OFFSET_TABLE_.
-          Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
-          Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
-          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
-              .addReg(X86::RIP)
-              .addImm(0)
-              .addReg(0)
-              .addSym(MF.getPICBaseSymbol())
-              .addReg(0);
-          std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol());
-          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg)
-              .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
-                                 X86II::MO_PIC_BASE_OFFSET);
-          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC)
-              .addReg(PBReg, RegState::Kill)
-              .addReg(GOTReg, RegState::Kill);
-        } else {
-          llvm_unreachable("unexpected code model");
-        }
+    Register PC;
+    if (STI.isPICStyleGOT())
+      PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+    else
+      PC = GlobalBaseReg;
+
+    if (STI.is64Bit()) {
+      if (TM->getCodeModel() == CodeModel::Medium) {
+        // In the medium code model, use a RIP-relative LEA to materialize the
+        // GOT.
+        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC)
+            .addReg(X86::RIP)
+            .addImm(0)
+            .addReg(0)
+            .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
+            .addReg(0);
+      } else if (TM->getCodeModel() == CodeModel::Large) {
+        // In the large code model, we are aiming for this code, though the
+        // register allocation may vary:
+        //   leaq .LN$pb(%rip), %rax
+        //   movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
+        //   addq %rcx, %rax
+        // RAX now holds address of _GLOBAL_OFFSET_TABLE_.
+        Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
+        Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
+        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
+            .addReg(X86::RIP)
+            .addImm(0)
+            .addReg(0)
+            .addSym(MF.getPICBaseSymbol())
+            .addReg(0);
+        std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol());
+        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg)
+            .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+                               X86II::MO_PIC_BASE_OFFSET);
+        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC)
+            .addReg(PBReg, RegState::Kill)
+            .addReg(GOTReg, RegState::Kill);
       } else {
-        // Operand of MovePCtoStack is completely ignored by asm printer. It's
-        // only used in JIT code emission as displacement to pc.
-        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
-
-        // If we're using vanilla 'GOT' PIC style, we should use relative
-        // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
-        if (STI.isPICStyleGOT()) {
-          // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel],
-          // %some_register
-          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
-              .addReg(PC)
-              .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
-                                 X86II::MO_GOT_ABSOLUTE_ADDRESS);
-        }
+        llvm_unreachable("unexpected code model");
+      }
+    } else {
+      // Operand of MovePCtoStack is completely ignored by asm printer. It's
+      // only used in JIT code emission as displacement to pc.
+      BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+      // If we're using vanilla 'GOT' PIC style, we should use relative
+      // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+      if (STI.isPICStyleGOT()) {
+        // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel],
+        // %some_register
+        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+            .addReg(PC)
+            .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+                               X86II::MO_GOT_ABSOLUTE_ADDRESS);
       }
-
-      return true;
     }
 
-    StringRef getPassName() const override {
-      return "X86 PIC Global Base Reg Initialization";
-    }
+    return true;
+  }
 
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.setPreservesCFG();
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-  };
+  StringRef getPassName() const override {
+    return "X86 PIC Global Base Reg Initialization";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
 } // namespace
 
 char CGBR::ID = 0;
-FunctionPass*
-llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
+FunctionPass *llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
 
 namespace {
-  struct LDTLSCleanup : public MachineFunctionPass {
-    static char ID;
-    LDTLSCleanup() : MachineFunctionPass(ID) {}
+struct LDTLSCleanup : public MachineFunctionPass {
+  static char ID;
+  LDTLSCleanup() : MachineFunctionPass(ID) {}
 
-    bool runOnMachineFunction(MachineFunction &MF) override {
-      if (skipFunction(MF.getFunction()))
-        return false;
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
 
-      X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
-        // No point folding accesses if there isn't at least two.
-        return false;
+    X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+    if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+      // No point folding accesses if there isn't at least two.
+      return false;
+    }
+
+    MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+    return VisitNode(DT->getRootNode(), 0);
+  }
+
+  // Visit the dominator subtree rooted at Node in pre-order.
+  // If TLSBaseAddrReg is non-null, then use that to replace any
+  // TLS_base_addr instructions. Otherwise, create the register
+  // when the first such instruction is seen, and then use it
+  // as we encounter more instructions.
+  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+    MachineBasicBlock *BB = Node->getBlock();
+    bool Changed = false;
+
+    // Traverse the current block.
+    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+         ++I) {
+      switch (I->getOpcode()) {
+      case X86::TLS_base_addr32:
+      case X86::TLS_base_addr64:
+        if (TLSBaseAddrReg)
+          I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
+        else
+          I = SetRegister(*I, &TLSBaseAddrReg);
+        Changed = true;
+        break;
+      default:
+        break;
       }
+    }
 
-      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
-      return VisitNode(DT->getRootNode(), 0);
+    // Visit the children of this block in the dominator tree.
+    for (auto &I : *Node) {
+      Changed |= VisitNode(I, TLSBaseAddrReg);
     }
 
-    // Visit the dominator subtree rooted at Node in pre-order.
-    // If TLSBaseAddrReg is non-null, then use that to replace any
-    // TLS_base_addr instructions. Otherwise, create the register
-    // when the first such instruction is seen, and then use it
-    // as we encounter more instructions.
-    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
-      MachineBasicBlock *BB = Node->getBlock();
-      bool Changed = false;
-
-      // Traverse the current block.
-      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
-           ++I) {
-        switch (I->getOpcode()) {
-          case X86::TLS_base_addr32:
-          case X86::TLS_base_addr64:
-            if (TLSBaseAddrReg)
-              I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
-            else
-              I = SetRegister(*I, &TLSBaseAddrReg);
-            Changed = true;
-            break;
-          default:
-            break;
-        }
-      }
+    return Changed;
+  }
 
-      // Visit the children of this block in the dominator tree.
-      for (auto &I : *Node) {
-        Changed |= VisitNode(I, TLSBaseAddrReg);
-      }
+  // Replace the TLS_base_addr instruction I with a copy from
+  // TLSBaseAddrReg, returning the new instruction.
+  MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
+                                       unsigned TLSBaseAddrReg) {
+    MachineFunction *MF = I.getParent()->getParent();
+    const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+    const bool is64Bit = STI.is64Bit();
+    const X86InstrInfo *TII = STI.getInstrInfo();
 
-      return Changed;
-    }
+    // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+    MachineInstr *Copy =
+        BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
+            .addReg(TLSBaseAddrReg);
 
-    // Replace the TLS_base_addr instruction I with a copy from
-    // TLSBaseAddrReg, returning the new instruction.
-    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
-                                         unsigned TLSBaseAddrReg) {
-      MachineFunction *MF = I.getParent()->getParent();
-      const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
-      const bool is64Bit = STI.is64Bit();
-      const X86InstrInfo *TII = STI.getInstrInfo();
-
-      // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
-      MachineInstr *Copy =
-          BuildMI(*I.getParent(), I, I.getDebugLoc(),
-                  TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
-              .addReg(TLSBaseAddrReg);
-
-      // Erase the TLS_base_addr instruction.
-      I.eraseFromParent();
-
-      return Copy;
-    }
+    // Erase the TLS_base_addr instruction.
+    I.eraseFromParent();
 
-    // Create a virtual register in *TLSBaseAddrReg, and populate it by
-    // inserting a copy instruction after I. Returns the new instruction.
-    MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
-      MachineFunction *MF = I.getParent()->getParent();
-      const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
-      const bool is64Bit = STI.is64Bit();
-      const X86InstrInfo *TII = STI.getInstrInfo();
-
-      // Create a virtual register for the TLS base address.
-      MachineRegisterInfo &RegInfo = MF->getRegInfo();
-      *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
-                                                      ? &X86::GR64RegClass
-                                                      : &X86::GR32RegClass);
-
-      // Insert a copy from RAX/EAX to TLSBaseAddrReg.
-      MachineInstr *Next = I.getNextNode();
-      MachineInstr *Copy =
-          BuildMI(*I.getParent(), Next, I.getDebugLoc(),
-                  TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
-              .addReg(is64Bit ? X86::RAX : X86::EAX);
-
-      return Copy;
-    }
+    return Copy;
+  }
 
-    StringRef getPassName() const override {
-      return "Local Dynamic TLS Access Clean-up";
-    }
+  // Create a virtual register in *TLSBaseAddrReg, and populate it by
+  // inserting a copy instruction after I. Returns the new instruction.
+  MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
+    MachineFunction *MF = I.getParent()->getParent();
+    const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+    const bool is64Bit = STI.is64Bit();
+    const X86InstrInfo *TII = STI.getInstrInfo();
 
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.setPreservesCFG();
-      AU.addRequired<MachineDominatorTree>();
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-  };
-}
+    // Create a virtual register for the TLS base address.
+    MachineRegisterInfo &RegInfo = MF->getRegInfo();
+    *TLSBaseAddrReg = RegInfo.createVirtualRegister(
+        is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass);
+
+    // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+    MachineInstr *Next = I.getNextNode();
+    MachineInstr *Copy = BuildMI(*I.getParent(), Next, I.getDebugLoc(),
+                                 TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+                             .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+    return Copy;
+  }
+
+  StringRef getPassName() const override {
+    return "Local Dynamic TLS Access Clean-up";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+} // namespace
 
 char LDTLSCleanup::ID = 0;
-FunctionPass*
-llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
+FunctionPass *llvm::createCleanupLocalDynamicTLSPass() {
+  return new LDTLSCleanup();
+}
 
 /// Constants defining how certain sequences should be outlined.
 ///
@@ -9932,10 +10313,7 @@ llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
 /// * Call construction overhead: 1 (jump instruction)
 /// * Frame construction overhead: 0 (don't need to return)
 ///
-enum MachineOutlinerClass {
-  MachineOutlinerDefault,
-  MachineOutlinerTailCall
-};
+enum MachineOutlinerClass { MachineOutlinerDefault, MachineOutlinerTailCall };
 
 std::optional<outliner::OutlinedFunction>
 X86InstrInfo::getOutliningCandidateInfo(
@@ -9995,8 +10373,8 @@ X86InstrInfo::getOutliningCandidateInfo(
                                     MachineOutlinerDefault);
 }
 
-bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
-                                           bool OutlineFromLinkOnceODRs) const {
+bool X86InstrInfo::isFunctionSafeToOutlineFrom(
+    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
   const Function &F = MF.getFunction();
 
   // Does the function use a red zone? If it does, then we can't risk messing
@@ -10011,14 +10389,15 @@ bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
   // If we *don't* want to outline from things that could potentially be deduped
   // then return false.
   if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
-      return false;
+    return false;
 
   // This function is viable for outlining, so return true.
   return true;
 }
 
 outliner::InstrType
-X86InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,  unsigned Flags) const {
+X86InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
+                                   unsigned Flags) const {
   MachineInstr &MI = *MIT;
 
   // Is this a terminator for a basic block?
@@ -10054,10 +10433,9 @@ X86InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,  unsigned F
   return outliner::InstrType::Legal;
 }
 
-void X86InstrInfo::buildOutlinedFrame(MachineBasicBlock &MBB,
-                                          MachineFunction &MF,
-                                          const outliner::OutlinedFunction &OF)
-                                          const {
+void X86InstrInfo::buildOutlinedFrame(
+    MachineBasicBlock &MBB, MachineFunction &MF,
+    const outliner::OutlinedFunction &OF) const {
   // If we're a tail call, we already have a return, so don't do anything.
   if (OF.FrameConstructionID == MachineOutlinerTailCall)
     return;
@@ -10068,22 +10446,18 @@ void X86InstrInfo::buildOutlinedFrame(MachineBasicBlock &MBB,
   MBB.insert(MBB.end(), retq);
 }
 
-MachineBasicBlock::iterator
-X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator &It,
-                                 MachineFunction &MF,
-                                 outliner::Candidate &C) const {
+MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(
+    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
+    MachineFunction &MF, outliner::Candidate &C) const {
   // Is it a tail call?
   if (C.CallConstructionID == MachineOutlinerTailCall) {
     // Yes, just insert a JMP.
-    It = MBB.insert(It,
-                  BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
-                      .addGlobalAddress(M.getNamedValue(MF.getName())));
+    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
+                            .addGlobalAddress(M.getNamedValue(MF.getName())));
   } else {
     // No, insert a call.
-    It = MBB.insert(It,
-                  BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
-                      .addGlobalAddress(M.getNamedValue(MF.getName())));
+    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
+                            .addGlobalAddress(M.getNamedValue(MF.getName())));
   }
 
   return It;
@@ -10120,8 +10494,8 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
 
     // PXOR is safe to use because it doesn't affect flags.
     BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
-      .addReg(Reg, RegState::Undef)
-      .addReg(Reg, RegState::Undef);
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
   } else if (X86::VR256RegClass.contains(Reg)) {
     // YMM#
     if (!ST.hasAVX())
@@ -10129,8 +10503,8 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
 
     // VPXOR is safe to use because it doesn't affect flags.
     BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
-      .addReg(Reg, RegState::Undef)
-      .addReg(Reg, RegState::Undef);
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
   } else if (X86::VR512RegClass.contains(Reg)) {
     // ZMM#
     if (!ST.hasAVX512())
@@ -10138,12 +10512,10 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
 
     // VPXORY is safe to use because it doesn't affect flags.
     BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
-      .addReg(Reg, RegState::Undef)
-      .addReg(Reg, RegState::Undef);
-  } else if (X86::VK1RegClass.contains(Reg) ||
-             X86::VK2RegClass.contains(Reg) ||
-             X86::VK4RegClass.contains(Reg) ||
-             X86::VK8RegClass.contains(Reg) ||
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
+  } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
+             X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
              X86::VK16RegClass.contains(Reg)) {
     if (!ST.hasVLX())
       return;

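(For readers unfamiliar with the pattern: the tables now live in X86ReplaceableInstrs.def, which defines an ENTRY macro that expands each row into an array initializer, and X86InstrInfo.cpp simply #includes the .def near the domain-switching code, as in the hunk above. Below is a minimal, self-contained sketch of that macro pattern; the enum and table names are hypothetical stand-ins for the real X86:: enumerators, not the actual LLVM sources.)

    #include <cstdint>
    #include <cstdio>

    // Hypothetical opcode values; the real tables use X86::* instruction enumerators.
    enum HypoOpcode : uint16_t { MOVAPSrr = 1, MOVAPDrr = 2, MOVDQArr = 3 };

    // In the patch, both the ENTRY macro and the ENTRY(...) rows live in the .def
    // file that the .cpp includes; they are inlined here so the sketch compiles
    // on its own.
    #define ENTRY(A, B, C) {A, B, C},
    static const uint16_t HypoReplaceableInstrs[][3] = {
        // PackedSingle, PackedDouble, PackedInt
        ENTRY(MOVAPSrr, MOVAPDrr, MOVDQArr)
    };
    #undef ENTRY

    int main() {
      // Each row maps one opcode to its equivalents in the other execution domains.
      std::printf("%u %u %u\n", HypoReplaceableInstrs[0][0],
                  HypoReplaceableInstrs[0][1], HypoReplaceableInstrs[0][2]);
      return 0;
    }
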
diff  --git a/llvm/lib/Target/X86/X86ReplaceableInstrs.def b/llvm/lib/Target/X86/X86ReplaceableInstrs.def
new file mode 100644
index 000000000000000..4798275c051923a
--- /dev/null
+++ b/llvm/lib/Target/X86/X86ReplaceableInstrs.def
@@ -0,0 +1,426 @@
+//===- X86ReplaceableInstrs.def ----------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
+
+#define ENTRY(A, B, C) {X86::A, X86::B, X86::C},
+static const uint16_t ReplaceableInstrs[][3] = {
+// PackedSingle, PackedDouble, PackedInt
+ENTRY(MOVAPSmr, MOVAPDmr, MOVDQAmr)
+ENTRY(MOVAPSrm, MOVAPDrm, MOVDQArm)
+ENTRY(MOVAPSrr, MOVAPDrr, MOVDQArr)
+ENTRY(MOVUPSmr, MOVUPDmr, MOVDQUmr)
+ENTRY(MOVUPSrm, MOVUPDrm, MOVDQUrm)
+ENTRY(MOVLPSmr, MOVLPDmr, MOVPQI2QImr)
+ENTRY(MOVSDmr, MOVSDmr, MOVPQI2QImr)
+ENTRY(MOVSSmr, MOVSSmr, MOVPDI2DImr)
+ENTRY(MOVSDrm, MOVSDrm, MOVQI2PQIrm)
+ENTRY(MOVSDrm_alt, MOVSDrm_alt, MOVQI2PQIrm)
+ENTRY(MOVSSrm, MOVSSrm, MOVDI2PDIrm)
+ENTRY(MOVSSrm_alt, MOVSSrm_alt, MOVDI2PDIrm)
+ENTRY(MOVNTPSmr, MOVNTPDmr, MOVNTDQmr)
+ENTRY(ANDNPSrm, ANDNPDrm, PANDNrm)
+ENTRY(ANDNPSrr, ANDNPDrr, PANDNrr)
+ENTRY(ANDPSrm, ANDPDrm, PANDrm)
+ENTRY(ANDPSrr, ANDPDrr, PANDrr)
+ENTRY(ORPSrm, ORPDrm, PORrm)
+ENTRY(ORPSrr, ORPDrr, PORrr)
+ENTRY(XORPSrm, XORPDrm, PXORrm)
+ENTRY(XORPSrr, XORPDrr, PXORrr)
+ENTRY(UNPCKLPDrm, UNPCKLPDrm, PUNPCKLQDQrm)
+ENTRY(MOVLHPSrr, UNPCKLPDrr, PUNPCKLQDQrr)
+ENTRY(UNPCKHPDrm, UNPCKHPDrm, PUNPCKHQDQrm)
+ENTRY(UNPCKHPDrr, UNPCKHPDrr, PUNPCKHQDQrr)
+ENTRY(UNPCKLPSrm, UNPCKLPSrm, PUNPCKLDQrm)
+ENTRY(UNPCKLPSrr, UNPCKLPSrr, PUNPCKLDQrr)
+ENTRY(UNPCKHPSrm, UNPCKHPSrm, PUNPCKHDQrm)
+ENTRY(UNPCKHPSrr, UNPCKHPSrr, PUNPCKHDQrr)
+ENTRY(EXTRACTPSmr, EXTRACTPSmr, PEXTRDmr)
+ENTRY(EXTRACTPSrr, EXTRACTPSrr, PEXTRDrr)
+// AVX 128-bit support
+ENTRY(VMOVAPSmr, VMOVAPDmr, VMOVDQAmr)
+ENTRY(VMOVAPSrm, VMOVAPDrm, VMOVDQArm)
+ENTRY(VMOVAPSrr, VMOVAPDrr, VMOVDQArr)
+ENTRY(VMOVUPSmr, VMOVUPDmr, VMOVDQUmr)
+ENTRY(VMOVUPSrm, VMOVUPDrm, VMOVDQUrm)
+ENTRY(VMOVLPSmr, VMOVLPDmr, VMOVPQI2QImr)
+ENTRY(VMOVSDmr, VMOVSDmr, VMOVPQI2QImr)
+ENTRY(VMOVSSmr, VMOVSSmr, VMOVPDI2DImr)
+ENTRY(VMOVSDrm, VMOVSDrm, VMOVQI2PQIrm)
+ENTRY(VMOVSDrm_alt, VMOVSDrm_alt, VMOVQI2PQIrm)
+ENTRY(VMOVSSrm, VMOVSSrm, VMOVDI2PDIrm)
+ENTRY(VMOVSSrm_alt, VMOVSSrm_alt, VMOVDI2PDIrm)
+ENTRY(VMOVNTPSmr, VMOVNTPDmr, VMOVNTDQmr)
+ENTRY(VANDNPSrm, VANDNPDrm, VPANDNrm)
+ENTRY(VANDNPSrr, VANDNPDrr, VPANDNrr)
+ENTRY(VANDPSrm, VANDPDrm, VPANDrm)
+ENTRY(VANDPSrr, VANDPDrr, VPANDrr)
+ENTRY(VORPSrm, VORPDrm, VPORrm)
+ENTRY(VORPSrr, VORPDrr, VPORrr)
+ENTRY(VXORPSrm, VXORPDrm, VPXORrm)
+ENTRY(VXORPSrr, VXORPDrr, VPXORrr)
+ENTRY(VUNPCKLPDrm, VUNPCKLPDrm, VPUNPCKLQDQrm)
+ENTRY(VMOVLHPSrr, VUNPCKLPDrr, VPUNPCKLQDQrr)
+ENTRY(VUNPCKHPDrm, VUNPCKHPDrm, VPUNPCKHQDQrm)
+ENTRY(VUNPCKHPDrr, VUNPCKHPDrr, VPUNPCKHQDQrr)
+ENTRY(VUNPCKLPSrm, VUNPCKLPSrm, VPUNPCKLDQrm)
+ENTRY(VUNPCKLPSrr, VUNPCKLPSrr, VPUNPCKLDQrr)
+ENTRY(VUNPCKHPSrm, VUNPCKHPSrm, VPUNPCKHDQrm)
+ENTRY(VUNPCKHPSrr, VUNPCKHPSrr, VPUNPCKHDQrr)
+ENTRY(VEXTRACTPSmr, VEXTRACTPSmr, VPEXTRDmr)
+ENTRY(VEXTRACTPSrr, VEXTRACTPSrr, VPEXTRDrr)
+// AVX 256-bit support
+ENTRY(VMOVAPSYmr, VMOVAPDYmr, VMOVDQAYmr)
+ENTRY(VMOVAPSYrm, VMOVAPDYrm, VMOVDQAYrm)
+ENTRY(VMOVAPSYrr, VMOVAPDYrr, VMOVDQAYrr)
+ENTRY(VMOVUPSYmr, VMOVUPDYmr, VMOVDQUYmr)
+ENTRY(VMOVUPSYrm, VMOVUPDYrm, VMOVDQUYrm)
+ENTRY(VMOVNTPSYmr, VMOVNTPDYmr, VMOVNTDQYmr)
+ENTRY(VPERMPSYrm, VPERMPSYrm, VPERMDYrm)
+ENTRY(VPERMPSYrr, VPERMPSYrr, VPERMDYrr)
+ENTRY(VPERMPDYmi, VPERMPDYmi, VPERMQYmi)
+ENTRY(VPERMPDYri, VPERMPDYri, VPERMQYri)
+// AVX512 support
+ENTRY(VMOVLPSZ128mr, VMOVLPDZ128mr, VMOVPQI2QIZmr)
+ENTRY(VMOVNTPSZ128mr, VMOVNTPDZ128mr, VMOVNTDQZ128mr)
+ENTRY(VMOVNTPSZ256mr, VMOVNTPDZ256mr, VMOVNTDQZ256mr)
+ENTRY(VMOVNTPSZmr, VMOVNTPDZmr, VMOVNTDQZmr)
+ENTRY(VMOVSDZmr, VMOVSDZmr, VMOVPQI2QIZmr)
+ENTRY(VMOVSSZmr, VMOVSSZmr, VMOVPDI2DIZmr)
+ENTRY(VMOVSDZrm, VMOVSDZrm, VMOVQI2PQIZrm)
+ENTRY(VMOVSDZrm_alt, VMOVSDZrm_alt, VMOVQI2PQIZrm)
+ENTRY(VMOVSSZrm, VMOVSSZrm, VMOVDI2PDIZrm)
+ENTRY(VMOVSSZrm_alt, VMOVSSZrm_alt, VMOVDI2PDIZrm)
+ENTRY(VBROADCASTSSZ128rr, VBROADCASTSSZ128rr, VPBROADCASTDZ128rr)
+ENTRY(VBROADCASTSSZ128rm, VBROADCASTSSZ128rm, VPBROADCASTDZ128rm)
+ENTRY(VBROADCASTSSZ256rr, VBROADCASTSSZ256rr, VPBROADCASTDZ256rr)
+ENTRY(VBROADCASTSSZ256rm, VBROADCASTSSZ256rm, VPBROADCASTDZ256rm)
+ENTRY(VBROADCASTSSZrr, VBROADCASTSSZrr, VPBROADCASTDZrr)
+ENTRY(VBROADCASTSSZrm, VBROADCASTSSZrm, VPBROADCASTDZrm)
+ENTRY(VMOVDDUPZ128rr, VMOVDDUPZ128rr, VPBROADCASTQZ128rr)
+ENTRY(VMOVDDUPZ128rm, VMOVDDUPZ128rm, VPBROADCASTQZ128rm)
+ENTRY(VBROADCASTSDZ256rr, VBROADCASTSDZ256rr, VPBROADCASTQZ256rr)
+ENTRY(VBROADCASTSDZ256rm, VBROADCASTSDZ256rm, VPBROADCASTQZ256rm)
+ENTRY(VBROADCASTSDZrr, VBROADCASTSDZrr, VPBROADCASTQZrr)
+ENTRY(VBROADCASTSDZrm, VBROADCASTSDZrm, VPBROADCASTQZrm)
+ENTRY(VINSERTF32x4Zrr, VINSERTF32x4Zrr, VINSERTI32x4Zrr)
+ENTRY(VINSERTF32x4Zrm, VINSERTF32x4Zrm, VINSERTI32x4Zrm)
+ENTRY(VINSERTF32x8Zrr, VINSERTF32x8Zrr, VINSERTI32x8Zrr)
+ENTRY(VINSERTF32x8Zrm, VINSERTF32x8Zrm, VINSERTI32x8Zrm)
+ENTRY(VINSERTF64x2Zrr, VINSERTF64x2Zrr, VINSERTI64x2Zrr)
+ENTRY(VINSERTF64x2Zrm, VINSERTF64x2Zrm, VINSERTI64x2Zrm)
+ENTRY(VINSERTF64x4Zrr, VINSERTF64x4Zrr, VINSERTI64x4Zrr)
+ENTRY(VINSERTF64x4Zrm, VINSERTF64x4Zrm, VINSERTI64x4Zrm)
+ENTRY(VINSERTF32x4Z256rr, VINSERTF32x4Z256rr, VINSERTI32x4Z256rr)
+ENTRY(VINSERTF32x4Z256rm, VINSERTF32x4Z256rm, VINSERTI32x4Z256rm)
+ENTRY(VINSERTF64x2Z256rr, VINSERTF64x2Z256rr, VINSERTI64x2Z256rr)
+ENTRY(VINSERTF64x2Z256rm, VINSERTF64x2Z256rm, VINSERTI64x2Z256rm)
+ENTRY(VEXTRACTF32x4Zrr, VEXTRACTF32x4Zrr, VEXTRACTI32x4Zrr)
+ENTRY(VEXTRACTF32x4Zmr, VEXTRACTF32x4Zmr, VEXTRACTI32x4Zmr)
+ENTRY(VEXTRACTF32x8Zrr, VEXTRACTF32x8Zrr, VEXTRACTI32x8Zrr)
+ENTRY(VEXTRACTF32x8Zmr, VEXTRACTF32x8Zmr, VEXTRACTI32x8Zmr)
+ENTRY(VEXTRACTF64x2Zrr, VEXTRACTF64x2Zrr, VEXTRACTI64x2Zrr)
+ENTRY(VEXTRACTF64x2Zmr, VEXTRACTF64x2Zmr, VEXTRACTI64x2Zmr)
+ENTRY(VEXTRACTF64x4Zrr, VEXTRACTF64x4Zrr, VEXTRACTI64x4Zrr)
+ENTRY(VEXTRACTF64x4Zmr, VEXTRACTF64x4Zmr, VEXTRACTI64x4Zmr)
+ENTRY(VEXTRACTF32x4Z256rr, VEXTRACTF32x4Z256rr, VEXTRACTI32x4Z256rr)
+ENTRY(VEXTRACTF32x4Z256mr, VEXTRACTF32x4Z256mr, VEXTRACTI32x4Z256mr)
+ENTRY(VEXTRACTF64x2Z256rr, VEXTRACTF64x2Z256rr, VEXTRACTI64x2Z256rr)
+ENTRY(VEXTRACTF64x2Z256mr, VEXTRACTF64x2Z256mr, VEXTRACTI64x2Z256mr)
+ENTRY(VPERMILPSmi, VPERMILPSmi, VPSHUFDmi)
+ENTRY(VPERMILPSri, VPERMILPSri, VPSHUFDri)
+ENTRY(VPERMILPSZ128mi, VPERMILPSZ128mi, VPSHUFDZ128mi)
+ENTRY(VPERMILPSZ128ri, VPERMILPSZ128ri, VPSHUFDZ128ri)
+ENTRY(VPERMILPSZ256mi, VPERMILPSZ256mi, VPSHUFDZ256mi)
+ENTRY(VPERMILPSZ256ri, VPERMILPSZ256ri, VPSHUFDZ256ri)
+ENTRY(VPERMILPSZmi, VPERMILPSZmi, VPSHUFDZmi)
+ENTRY(VPERMILPSZri, VPERMILPSZri, VPSHUFDZri)
+ENTRY(VPERMPSZ256rm, VPERMPSZ256rm, VPERMDZ256rm)
+ENTRY(VPERMPSZ256rr, VPERMPSZ256rr, VPERMDZ256rr)
+ENTRY(VPERMPDZ256mi, VPERMPDZ256mi, VPERMQZ256mi)
+ENTRY(VPERMPDZ256ri, VPERMPDZ256ri, VPERMQZ256ri)
+ENTRY(VPERMPDZ256rm, VPERMPDZ256rm, VPERMQZ256rm)
+ENTRY(VPERMPDZ256rr, VPERMPDZ256rr, VPERMQZ256rr)
+ENTRY(VPERMPSZrm, VPERMPSZrm, VPERMDZrm)
+ENTRY(VPERMPSZrr, VPERMPSZrr, VPERMDZrr)
+ENTRY(VPERMPDZmi, VPERMPDZmi, VPERMQZmi)
+ENTRY(VPERMPDZri, VPERMPDZri, VPERMQZri)
+ENTRY(VPERMPDZrm, VPERMPDZrm, VPERMQZrm)
+ENTRY(VPERMPDZrr, VPERMPDZrr, VPERMQZrr)
+ENTRY(VUNPCKLPDZ256rm, VUNPCKLPDZ256rm, VPUNPCKLQDQZ256rm)
+ENTRY(VUNPCKLPDZ256rr, VUNPCKLPDZ256rr, VPUNPCKLQDQZ256rr)
+ENTRY(VUNPCKHPDZ256rm, VUNPCKHPDZ256rm, VPUNPCKHQDQZ256rm)
+ENTRY(VUNPCKHPDZ256rr, VUNPCKHPDZ256rr, VPUNPCKHQDQZ256rr)
+ENTRY(VUNPCKLPSZ256rm, VUNPCKLPSZ256rm, VPUNPCKLDQZ256rm)
+ENTRY(VUNPCKLPSZ256rr, VUNPCKLPSZ256rr, VPUNPCKLDQZ256rr)
+ENTRY(VUNPCKHPSZ256rm, VUNPCKHPSZ256rm, VPUNPCKHDQZ256rm)
+ENTRY(VUNPCKHPSZ256rr, VUNPCKHPSZ256rr, VPUNPCKHDQZ256rr)
+ENTRY(VUNPCKLPDZ128rm, VUNPCKLPDZ128rm, VPUNPCKLQDQZ128rm)
+ENTRY(VMOVLHPSZrr, VUNPCKLPDZ128rr, VPUNPCKLQDQZ128rr)
+ENTRY(VUNPCKHPDZ128rm, VUNPCKHPDZ128rm, VPUNPCKHQDQZ128rm)
+ENTRY(VUNPCKHPDZ128rr, VUNPCKHPDZ128rr, VPUNPCKHQDQZ128rr)
+ENTRY(VUNPCKLPSZ128rm, VUNPCKLPSZ128rm, VPUNPCKLDQZ128rm)
+ENTRY(VUNPCKLPSZ128rr, VUNPCKLPSZ128rr, VPUNPCKLDQZ128rr)
+ENTRY(VUNPCKHPSZ128rm, VUNPCKHPSZ128rm, VPUNPCKHDQZ128rm)
+ENTRY(VUNPCKHPSZ128rr, VUNPCKHPSZ128rr, VPUNPCKHDQZ128rr)
+ENTRY(VUNPCKLPDZrm, VUNPCKLPDZrm, VPUNPCKLQDQZrm)
+ENTRY(VUNPCKLPDZrr, VUNPCKLPDZrr, VPUNPCKLQDQZrr)
+ENTRY(VUNPCKHPDZrm, VUNPCKHPDZrm, VPUNPCKHQDQZrm)
+ENTRY(VUNPCKHPDZrr, VUNPCKHPDZrr, VPUNPCKHQDQZrr)
+ENTRY(VUNPCKLPSZrm, VUNPCKLPSZrm, VPUNPCKLDQZrm)
+ENTRY(VUNPCKLPSZrr, VUNPCKLPSZrr, VPUNPCKLDQZrr)
+ENTRY(VUNPCKHPSZrm, VUNPCKHPSZrm, VPUNPCKHDQZrm)
+ENTRY(VUNPCKHPSZrr, VUNPCKHPSZrr, VPUNPCKHDQZrr)
+ENTRY(VEXTRACTPSZmr, VEXTRACTPSZmr, VPEXTRDZmr)
+ENTRY(VEXTRACTPSZrr, VEXTRACTPSZrr, VPEXTRDZrr)
+};
+
+static const uint16_t ReplaceableInstrsAVX2[][3] = {
+// PackedSingle, PackedDouble, PackedInt
+ENTRY(VANDNPSYrm, VANDNPDYrm, VPANDNYrm)
+ENTRY(VANDNPSYrr, VANDNPDYrr, VPANDNYrr)
+ENTRY(VANDPSYrm, VANDPDYrm, VPANDYrm)
+ENTRY(VANDPSYrr, VANDPDYrr, VPANDYrr)
+ENTRY(VORPSYrm, VORPDYrm, VPORYrm)
+ENTRY(VORPSYrr, VORPDYrr, VPORYrr)
+ENTRY(VXORPSYrm, VXORPDYrm, VPXORYrm)
+ENTRY(VXORPSYrr, VXORPDYrr, VPXORYrr)
+ENTRY(VPERM2F128rm, VPERM2F128rm, VPERM2I128rm)
+ENTRY(VPERM2F128rr, VPERM2F128rr, VPERM2I128rr)
+ENTRY(VBROADCASTSSrm, VBROADCASTSSrm, VPBROADCASTDrm)
+ENTRY(VBROADCASTSSrr, VBROADCASTSSrr, VPBROADCASTDrr)
+ENTRY(VMOVDDUPrm, VMOVDDUPrm, VPBROADCASTQrm)
+ENTRY(VMOVDDUPrr, VMOVDDUPrr, VPBROADCASTQrr)
+ENTRY(VBROADCASTSSYrr, VBROADCASTSSYrr, VPBROADCASTDYrr)
+ENTRY(VBROADCASTSSYrm, VBROADCASTSSYrm, VPBROADCASTDYrm)
+ENTRY(VBROADCASTSDYrr, VBROADCASTSDYrr, VPBROADCASTQYrr)
+ENTRY(VBROADCASTSDYrm, VBROADCASTSDYrm, VPBROADCASTQYrm)
+ENTRY(VBROADCASTF128, VBROADCASTF128, VBROADCASTI128)
+ENTRY(VBLENDPSYrri, VBLENDPSYrri, VPBLENDDYrri)
+ENTRY(VBLENDPSYrmi, VBLENDPSYrmi, VPBLENDDYrmi)
+ENTRY(VPERMILPSYmi, VPERMILPSYmi, VPSHUFDYmi)
+ENTRY(VPERMILPSYri, VPERMILPSYri, VPSHUFDYri)
+ENTRY(VUNPCKLPDYrm, VUNPCKLPDYrm, VPUNPCKLQDQYrm)
+ENTRY(VUNPCKLPDYrr, VUNPCKLPDYrr, VPUNPCKLQDQYrr)
+ENTRY(VUNPCKHPDYrm, VUNPCKHPDYrm, VPUNPCKHQDQYrm)
+ENTRY(VUNPCKHPDYrr, VUNPCKHPDYrr, VPUNPCKHQDQYrr)
+ENTRY(VUNPCKLPSYrm, VUNPCKLPSYrm, VPUNPCKLDQYrm)
+ENTRY(VUNPCKLPSYrr, VUNPCKLPSYrr, VPUNPCKLDQYrr)
+ENTRY(VUNPCKHPSYrm, VUNPCKHPSYrm, VPUNPCKHDQYrm)
+ENTRY(VUNPCKHPSYrr, VUNPCKHPSYrr, VPUNPCKHDQYrr)
+};
+
+static const uint16_t ReplaceableInstrsFP[][3] = {
+// PackedSingle, PackedDouble
+ENTRY(MOVLPSrm, MOVLPDrm, INSTRUCTION_LIST_END)
+ENTRY(MOVHPSrm, MOVHPDrm, INSTRUCTION_LIST_END)
+ENTRY(MOVHPSmr, MOVHPDmr, INSTRUCTION_LIST_END)
+ENTRY(VMOVLPSrm, VMOVLPDrm, INSTRUCTION_LIST_END)
+ENTRY(VMOVHPSrm, VMOVHPDrm, INSTRUCTION_LIST_END)
+ENTRY(VMOVHPSmr, VMOVHPDmr, INSTRUCTION_LIST_END)
+ENTRY(VMOVLPSZ128rm, VMOVLPDZ128rm, INSTRUCTION_LIST_END)
+ENTRY(VMOVHPSZ128rm, VMOVHPDZ128rm, INSTRUCTION_LIST_END)
+ENTRY(VMOVHPSZ128mr, VMOVHPDZ128mr, INSTRUCTION_LIST_END)
+};
+
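In the ReplaceableInstrsFP table above, only the PackedSingle and PackedDouble columns name real instructions; the third column is the X86::INSTRUCTION_LIST_END sentinel, meaning these loads/stores have no integer-domain equivalent. A minimal standalone sketch of how a consumer might honor that sentinel (the constant below is a placeholder, not the generated enum value):

#include <cstdint>

// Placeholder standing in for X86::INSTRUCTION_LIST_END from the generated
// opcode enum.
constexpr uint16_t kInstructionListEnd = 0xFFFF;

// Returns 0 when the requested column holds the sentinel, i.e. when the
// instruction has no replacement in that execution domain.
inline uint16_t pickDomainOpcode(const uint16_t row[3], unsigned column) {
  uint16_t opc = row[column];
  return opc == kInstructionListEnd ? 0 : opc;
}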
+static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = {
+// PackedSingle, PackedDouble, PackedInt
+ENTRY(VEXTRACTF128mr, VEXTRACTF128mr, VEXTRACTI128mr)
+ENTRY(VEXTRACTF128rr, VEXTRACTF128rr, VEXTRACTI128rr)
+ENTRY(VINSERTF128rm, VINSERTF128rm, VINSERTI128rm)
+ENTRY(VINSERTF128rr, VINSERTF128rr, VINSERTI128rr)
+};
+
+// NOTE: These should only be used by the custom domain methods.
+static const uint16_t ReplaceableBlendInstrs[][3] = {
+// PackedSingle, PackedDouble, PackedInt
+ENTRY(BLENDPSrmi, BLENDPDrmi, PBLENDWrmi)
+ENTRY(BLENDPSrri, BLENDPDrri, PBLENDWrri)
+ENTRY(VBLENDPSrmi, VBLENDPDrmi, VPBLENDWrmi)
+ENTRY(VBLENDPSrri, VBLENDPDrri, VPBLENDWrri)
+ENTRY(VBLENDPSYrmi, VBLENDPDYrmi, VPBLENDWYrmi)
+ENTRY(VBLENDPSYrri, VBLENDPDYrri, VPBLENDWYrri)
+};
+
+static const uint16_t ReplaceableBlendAVX2Instrs[][3] = {
+// PackedSingle, PackedDouble, PackedInt
+ENTRY(VBLENDPSrmi, VBLENDPDrmi, VPBLENDDrmi)
+ENTRY(VBLENDPSrri, VBLENDPDrri, VPBLENDDrri)
+ENTRY(VBLENDPSYrmi, VBLENDPDYrmi, VPBLENDDYrmi)
+ENTRY(VBLENDPSYrri, VBLENDPDYrri, VPBLENDDYrri)
+};
+
+#undef ENTRY
+#define ENTRY(A, B, C, D) {X86::A, X86::B, X86::C, X86::D},
+static const uint16_t ReplaceableInstrsAVX512[][4] = {
+// Two integer columns for 64-bit and 32-bit elements.
+// PackedSingle, PackedDouble, PackedInt, PackedInt
+ENTRY(VMOVAPSZ128mr, VMOVAPDZ128mr, VMOVDQA64Z128mr, VMOVDQA32Z128mr)
+ENTRY(VMOVAPSZ128rm, VMOVAPDZ128rm, VMOVDQA64Z128rm, VMOVDQA32Z128rm)
+ENTRY(VMOVAPSZ128rr, VMOVAPDZ128rr, VMOVDQA64Z128rr, VMOVDQA32Z128rr)
+ENTRY(VMOVUPSZ128mr, VMOVUPDZ128mr, VMOVDQU64Z128mr, VMOVDQU32Z128mr)
+ENTRY(VMOVUPSZ128rm, VMOVUPDZ128rm, VMOVDQU64Z128rm, VMOVDQU32Z128rm)
+ENTRY(VMOVAPSZ256mr, VMOVAPDZ256mr, VMOVDQA64Z256mr, VMOVDQA32Z256mr)
+ENTRY(VMOVAPSZ256rm, VMOVAPDZ256rm, VMOVDQA64Z256rm, VMOVDQA32Z256rm)
+ENTRY(VMOVAPSZ256rr, VMOVAPDZ256rr, VMOVDQA64Z256rr, VMOVDQA32Z256rr)
+ENTRY(VMOVUPSZ256mr, VMOVUPDZ256mr, VMOVDQU64Z256mr, VMOVDQU32Z256mr)
+ENTRY(VMOVUPSZ256rm, VMOVUPDZ256rm, VMOVDQU64Z256rm, VMOVDQU32Z256rm)
+ENTRY(VMOVAPSZmr, VMOVAPDZmr, VMOVDQA64Zmr, VMOVDQA32Zmr)
+ENTRY(VMOVAPSZrm, VMOVAPDZrm, VMOVDQA64Zrm, VMOVDQA32Zrm)
+ENTRY(VMOVAPSZrr, VMOVAPDZrr, VMOVDQA64Zrr, VMOVDQA32Zrr)
+ENTRY(VMOVUPSZmr, VMOVUPDZmr, VMOVDQU64Zmr, VMOVDQU32Zmr)
+ENTRY(VMOVUPSZrm, VMOVUPDZrm, VMOVDQU64Zrm, VMOVDQU32Zrm)
+};
+
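The AVX-512 tables carry two integer columns because the integer domain splits by element width there (e.g. VMOVDQA64 vs. VMOVDQA32), so a consumer needs one extra piece of information to pick a column. A minimal sketch, assuming the usual 1-based PackedSingle/PackedDouble/PackedInt domain numbering (an assumption made for illustration, not a description of the in-tree code):

// Column layout of the four-wide tables in this file.
enum DomainColumn : unsigned {
  ColPackedSingle = 0,
  ColPackedDouble = 1,
  ColPackedInt64  = 2, // 64-bit integer elements
  ColPackedInt32  = 3, // 32-bit integer elements
};

// domain: 1 = PackedSingle, 2 = PackedDouble, 3 = PackedInt (assumed).
inline unsigned columnForDomain(unsigned domain, bool want64BitElements) {
  if (domain == 1)
    return ColPackedSingle;
  if (domain == 2)
    return ColPackedDouble;
  return want64BitElements ? ColPackedInt64 : ColPackedInt32;
}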
+static const uint16_t ReplaceableInstrsAVX512DQ[][4] = {
+// Two integer columns for 64-bit and 32-bit elements.
+// PackedSingle, PackedDouble, PackedInt, PackedInt
+ENTRY(VANDNPSZ128rm, VANDNPDZ128rm, VPANDNQZ128rm, VPANDNDZ128rm)
+ENTRY(VANDNPSZ128rr, VANDNPDZ128rr, VPANDNQZ128rr, VPANDNDZ128rr)
+ENTRY(VANDPSZ128rm, VANDPDZ128rm, VPANDQZ128rm, VPANDDZ128rm)
+ENTRY(VANDPSZ128rr, VANDPDZ128rr, VPANDQZ128rr, VPANDDZ128rr)
+ENTRY(VORPSZ128rm, VORPDZ128rm, VPORQZ128rm, VPORDZ128rm)
+ENTRY(VORPSZ128rr, VORPDZ128rr, VPORQZ128rr, VPORDZ128rr)
+ENTRY(VXORPSZ128rm, VXORPDZ128rm, VPXORQZ128rm, VPXORDZ128rm)
+ENTRY(VXORPSZ128rr, VXORPDZ128rr, VPXORQZ128rr, VPXORDZ128rr)
+ENTRY(VANDNPSZ256rm, VANDNPDZ256rm, VPANDNQZ256rm, VPANDNDZ256rm)
+ENTRY(VANDNPSZ256rr, VANDNPDZ256rr, VPANDNQZ256rr, VPANDNDZ256rr)
+ENTRY(VANDPSZ256rm, VANDPDZ256rm, VPANDQZ256rm, VPANDDZ256rm)
+ENTRY(VANDPSZ256rr, VANDPDZ256rr, VPANDQZ256rr, VPANDDZ256rr)
+ENTRY(VORPSZ256rm, VORPDZ256rm, VPORQZ256rm, VPORDZ256rm)
+ENTRY(VORPSZ256rr, VORPDZ256rr, VPORQZ256rr, VPORDZ256rr)
+ENTRY(VXORPSZ256rm, VXORPDZ256rm, VPXORQZ256rm, VPXORDZ256rm)
+ENTRY(VXORPSZ256rr, VXORPDZ256rr, VPXORQZ256rr, VPXORDZ256rr)
+ENTRY(VANDNPSZrm, VANDNPDZrm, VPANDNQZrm, VPANDNDZrm)
+ENTRY(VANDNPSZrr, VANDNPDZrr, VPANDNQZrr, VPANDNDZrr)
+ENTRY(VANDPSZrm, VANDPDZrm, VPANDQZrm, VPANDDZrm)
+ENTRY(VANDPSZrr, VANDPDZrr, VPANDQZrr, VPANDDZrr)
+ENTRY(VORPSZrm, VORPDZrm, VPORQZrm, VPORDZrm)
+ENTRY(VORPSZrr, VORPDZrr, VPORQZrr, VPORDZrr)
+ENTRY(VXORPSZrm, VXORPDZrm, VPXORQZrm, VPXORDZrm)
+ENTRY(VXORPSZrr, VXORPDZrr, VPXORQZrr, VPXORDZrr)
+};
+
+static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
+// Two integer columns for 64-bit and 32-bit elements.
+// PackedSingle, PackedDouble, PackedInt, PackedInt
+ENTRY(VANDNPSZ128rmk, VANDNPDZ128rmk, VPANDNQZ128rmk, VPANDNDZ128rmk)
+ENTRY(VANDNPSZ128rmkz, VANDNPDZ128rmkz, VPANDNQZ128rmkz, VPANDNDZ128rmkz)
+ENTRY(VANDNPSZ128rrk, VANDNPDZ128rrk, VPANDNQZ128rrk, VPANDNDZ128rrk)
+ENTRY(VANDNPSZ128rrkz, VANDNPDZ128rrkz, VPANDNQZ128rrkz, VPANDNDZ128rrkz)
+ENTRY(VANDPSZ128rmk, VANDPDZ128rmk, VPANDQZ128rmk, VPANDDZ128rmk)
+ENTRY(VANDPSZ128rmkz, VANDPDZ128rmkz, VPANDQZ128rmkz, VPANDDZ128rmkz)
+ENTRY(VANDPSZ128rrk, VANDPDZ128rrk, VPANDQZ128rrk, VPANDDZ128rrk)
+ENTRY(VANDPSZ128rrkz, VANDPDZ128rrkz, VPANDQZ128rrkz, VPANDDZ128rrkz)
+ENTRY(VORPSZ128rmk, VORPDZ128rmk, VPORQZ128rmk, VPORDZ128rmk)
+ENTRY(VORPSZ128rmkz, VORPDZ128rmkz, VPORQZ128rmkz, VPORDZ128rmkz)
+ENTRY(VORPSZ128rrk, VORPDZ128rrk, VPORQZ128rrk, VPORDZ128rrk)
+ENTRY(VORPSZ128rrkz, VORPDZ128rrkz, VPORQZ128rrkz, VPORDZ128rrkz)
+ENTRY(VXORPSZ128rmk, VXORPDZ128rmk, VPXORQZ128rmk, VPXORDZ128rmk)
+ENTRY(VXORPSZ128rmkz, VXORPDZ128rmkz, VPXORQZ128rmkz, VPXORDZ128rmkz)
+ENTRY(VXORPSZ128rrk, VXORPDZ128rrk, VPXORQZ128rrk, VPXORDZ128rrk)
+ENTRY(VXORPSZ128rrkz, VXORPDZ128rrkz, VPXORQZ128rrkz, VPXORDZ128rrkz)
+ENTRY(VANDNPSZ256rmk, VANDNPDZ256rmk, VPANDNQZ256rmk, VPANDNDZ256rmk)
+ENTRY(VANDNPSZ256rmkz, VANDNPDZ256rmkz, VPANDNQZ256rmkz, VPANDNDZ256rmkz)
+ENTRY(VANDNPSZ256rrk, VANDNPDZ256rrk, VPANDNQZ256rrk, VPANDNDZ256rrk)
+ENTRY(VANDNPSZ256rrkz, VANDNPDZ256rrkz, VPANDNQZ256rrkz, VPANDNDZ256rrkz)
+ENTRY(VANDPSZ256rmk, VANDPDZ256rmk, VPANDQZ256rmk, VPANDDZ256rmk)
+ENTRY(VANDPSZ256rmkz, VANDPDZ256rmkz, VPANDQZ256rmkz, VPANDDZ256rmkz)
+ENTRY(VANDPSZ256rrk, VANDPDZ256rrk, VPANDQZ256rrk, VPANDDZ256rrk)
+ENTRY(VANDPSZ256rrkz, VANDPDZ256rrkz, VPANDQZ256rrkz, VPANDDZ256rrkz)
+ENTRY(VORPSZ256rmk, VORPDZ256rmk, VPORQZ256rmk, VPORDZ256rmk)
+ENTRY(VORPSZ256rmkz, VORPDZ256rmkz, VPORQZ256rmkz, VPORDZ256rmkz)
+ENTRY(VORPSZ256rrk, VORPDZ256rrk, VPORQZ256rrk, VPORDZ256rrk)
+ENTRY(VORPSZ256rrkz, VORPDZ256rrkz, VPORQZ256rrkz, VPORDZ256rrkz)
+ENTRY(VXORPSZ256rmk, VXORPDZ256rmk, VPXORQZ256rmk, VPXORDZ256rmk)
+ENTRY(VXORPSZ256rmkz, VXORPDZ256rmkz, VPXORQZ256rmkz, VPXORDZ256rmkz)
+ENTRY(VXORPSZ256rrk, VXORPDZ256rrk, VPXORQZ256rrk, VPXORDZ256rrk)
+ENTRY(VXORPSZ256rrkz, VXORPDZ256rrkz, VPXORQZ256rrkz, VPXORDZ256rrkz)
+ENTRY(VANDNPSZrmk, VANDNPDZrmk, VPANDNQZrmk, VPANDNDZrmk)
+ENTRY(VANDNPSZrmkz, VANDNPDZrmkz, VPANDNQZrmkz, VPANDNDZrmkz)
+ENTRY(VANDNPSZrrk, VANDNPDZrrk, VPANDNQZrrk, VPANDNDZrrk)
+ENTRY(VANDNPSZrrkz, VANDNPDZrrkz, VPANDNQZrrkz, VPANDNDZrrkz)
+ENTRY(VANDPSZrmk, VANDPDZrmk, VPANDQZrmk, VPANDDZrmk)
+ENTRY(VANDPSZrmkz, VANDPDZrmkz, VPANDQZrmkz, VPANDDZrmkz)
+ENTRY(VANDPSZrrk, VANDPDZrrk, VPANDQZrrk, VPANDDZrrk)
+ENTRY(VANDPSZrrkz, VANDPDZrrkz, VPANDQZrrkz, VPANDDZrrkz)
+ENTRY(VORPSZrmk, VORPDZrmk, VPORQZrmk, VPORDZrmk)
+ENTRY(VORPSZrmkz, VORPDZrmkz, VPORQZrmkz, VPORDZrmkz)
+ENTRY(VORPSZrrk, VORPDZrrk, VPORQZrrk, VPORDZrrk)
+ENTRY(VORPSZrrkz, VORPDZrrkz, VPORQZrrkz, VPORDZrrkz)
+ENTRY(VXORPSZrmk, VXORPDZrmk, VPXORQZrmk, VPXORDZrmk)
+ENTRY(VXORPSZrmkz, VXORPDZrmkz, VPXORQZrmkz, VPXORDZrmkz)
+ENTRY(VXORPSZrrk, VXORPDZrrk, VPXORQZrrk, VPXORDZrrk)
+ENTRY(VXORPSZrrkz, VXORPDZrrkz, VPXORQZrrkz, VPXORDZrrkz)
+// Broadcast loads can be handled the same as masked operations to avoid
+// changing element size.
+ENTRY(VANDNPSZ128rmb, VANDNPDZ128rmb, VPANDNQZ128rmb, VPANDNDZ128rmb)
+ENTRY(VANDPSZ128rmb, VANDPDZ128rmb, VPANDQZ128rmb, VPANDDZ128rmb)
+ENTRY(VORPSZ128rmb, VORPDZ128rmb, VPORQZ128rmb, VPORDZ128rmb)
+ENTRY(VXORPSZ128rmb, VXORPDZ128rmb, VPXORQZ128rmb, VPXORDZ128rmb)
+ENTRY(VANDNPSZ256rmb, VANDNPDZ256rmb, VPANDNQZ256rmb, VPANDNDZ256rmb)
+ENTRY(VANDPSZ256rmb, VANDPDZ256rmb, VPANDQZ256rmb, VPANDDZ256rmb)
+ENTRY(VORPSZ256rmb, VORPDZ256rmb, VPORQZ256rmb, VPORDZ256rmb)
+ENTRY(VXORPSZ256rmb, VXORPDZ256rmb, VPXORQZ256rmb, VPXORDZ256rmb)
+ENTRY(VANDNPSZrmb, VANDNPDZrmb, VPANDNQZrmb, VPANDNDZrmb)
+ENTRY(VANDPSZrmb, VANDPDZrmb, VPANDQZrmb, VPANDDZrmb)
+ENTRY(VANDPSZrmb, VANDPDZrmb, VPANDQZrmb, VPANDDZrmb)
+ENTRY(VORPSZrmb, VORPDZrmb, VPORQZrmb, VPORDZrmb)
+ENTRY(VXORPSZrmb, VXORPDZrmb, VPXORQZrmb, VPXORDZrmb)
+ENTRY(VANDNPSZ128rmbk, VANDNPDZ128rmbk, VPANDNQZ128rmbk, VPANDNDZ128rmbk)
+ENTRY(VANDPSZ128rmbk, VANDPDZ128rmbk, VPANDQZ128rmbk, VPANDDZ128rmbk)
+ENTRY(VORPSZ128rmbk, VORPDZ128rmbk, VPORQZ128rmbk, VPORDZ128rmbk)
+ENTRY(VXORPSZ128rmbk, VXORPDZ128rmbk, VPXORQZ128rmbk, VPXORDZ128rmbk)
+ENTRY(VANDNPSZ256rmbk, VANDNPDZ256rmbk, VPANDNQZ256rmbk, VPANDNDZ256rmbk)
+ENTRY(VANDPSZ256rmbk, VANDPDZ256rmbk, VPANDQZ256rmbk, VPANDDZ256rmbk)
+ENTRY(VORPSZ256rmbk, VORPDZ256rmbk, VPORQZ256rmbk, VPORDZ256rmbk)
+ENTRY(VXORPSZ256rmbk, VXORPDZ256rmbk, VPXORQZ256rmbk, VPXORDZ256rmbk)
+ENTRY(VANDNPSZrmbk, VANDNPDZrmbk, VPANDNQZrmbk, VPANDNDZrmbk)
+ENTRY(VANDPSZrmbk, VANDPDZrmbk, VPANDQZrmbk, VPANDDZrmbk)
+ENTRY(VANDPSZrmbk, VANDPDZrmbk, VPANDQZrmbk, VPANDDZrmbk)
+ENTRY(VORPSZrmbk, VORPDZrmbk, VPORQZrmbk, VPORDZrmbk)
+ENTRY(VXORPSZrmbk, VXORPDZrmbk, VPXORQZrmbk, VPXORDZrmbk)
+ENTRY(VANDNPSZ128rmbkz, VANDNPDZ128rmbkz, VPANDNQZ128rmbkz, VPANDNDZ128rmbkz)
+ENTRY(VANDPSZ128rmbkz, VANDPDZ128rmbkz, VPANDQZ128rmbkz, VPANDDZ128rmbkz)
+ENTRY(VORPSZ128rmbkz, VORPDZ128rmbkz, VPORQZ128rmbkz, VPORDZ128rmbkz)
+ENTRY(VXORPSZ128rmbkz, VXORPDZ128rmbkz, VPXORQZ128rmbkz, VPXORDZ128rmbkz)
+ENTRY(VANDNPSZ256rmbkz, VANDNPDZ256rmbkz, VPANDNQZ256rmbkz, VPANDNDZ256rmbkz)
+ENTRY(VANDPSZ256rmbkz, VANDPDZ256rmbkz, VPANDQZ256rmbkz, VPANDDZ256rmbkz)
+ENTRY(VORPSZ256rmbkz, VORPDZ256rmbkz, VPORQZ256rmbkz, VPORDZ256rmbkz)
+ENTRY(VXORPSZ256rmbkz, VXORPDZ256rmbkz, VPXORQZ256rmbkz, VPXORDZ256rmbkz)
+ENTRY(VANDNPSZrmbkz, VANDNPDZrmbkz, VPANDNQZrmbkz, VPANDNDZrmbkz)
+ENTRY(VANDPSZrmbkz, VANDPDZrmbkz, VPANDQZrmbkz, VPANDDZrmbkz)
+ENTRY(VANDPSZrmbkz, VANDPDZrmbkz, VPANDQZrmbkz, VPANDDZrmbkz)
+ENTRY(VORPSZrmbkz, VORPDZrmbkz, VPORQZrmbkz, VPORDZrmbkz)
+ENTRY(VXORPSZrmbkz, VXORPDZrmbkz, VPXORQZrmbkz, VPXORDZrmbkz)
+};
+
+// Special table for changing EVEX logic instructions to VEX.
+// TODO: Should we run EVEX->VEX earlier?
+static const uint16_t ReplaceableCustomAVX512LogicInstrs[][4] = {
+// Two integer columns for 64-bit and 32-bit elements.
+// PackedSingle, PackedDouble, PackedInt, PackedInt
+ENTRY(VANDNPSrm, VANDNPDrm, VPANDNQZ128rm, VPANDNDZ128rm)
+ENTRY(VANDNPSrr, VANDNPDrr, VPANDNQZ128rr, VPANDNDZ128rr)
+ENTRY(VANDPSrm, VANDPDrm, VPANDQZ128rm, VPANDDZ128rm)
+ENTRY(VANDPSrr, VANDPDrr, VPANDQZ128rr, VPANDDZ128rr)
+ENTRY(VORPSrm, VORPDrm, VPORQZ128rm, VPORDZ128rm)
+ENTRY(VORPSrr, VORPDrr, VPORQZ128rr, VPORDZ128rr)
+ENTRY(VXORPSrm, VXORPDrm, VPXORQZ128rm, VPXORDZ128rm)
+ENTRY(VXORPSrr, VXORPDrr, VPXORQZ128rr, VPXORDZ128rr)
+ENTRY(VANDNPSYrm, VANDNPDYrm, VPANDNQZ256rm, VPANDNDZ256rm)
+ENTRY(VANDNPSYrr, VANDNPDYrr, VPANDNQZ256rr, VPANDNDZ256rr)
+ENTRY(VANDPSYrm, VANDPDYrm, VPANDQZ256rm, VPANDDZ256rm)
+ENTRY(VANDPSYrr, VANDPDYrr, VPANDQZ256rr, VPANDDZ256rr)
+ENTRY(VORPSYrm, VORPDYrm, VPORQZ256rm, VPORDZ256rm)
+ENTRY(VORPSYrr, VORPDYrr, VPORQZ256rr, VPORDZ256rr)
+ENTRY(VXORPSYrm, VXORPDYrm, VPXORQZ256rm, VPXORDZ256rm)
+ENTRY(VXORPSYrr, VXORPDYrr, VPXORQZ256rr, VPXORDZ256rr)
+};
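Taken together, these tables describe execution-domain equivalences: each row spells the same operation as a PackedSingle, PackedDouble, and PackedInt instruction, and a domain fixup rewrites an opcode by moving along its row. A standalone sketch of such a lookup (the in-tree helpers in X86InstrInfo.cpp differ in signature and detail):

#include <cstddef>
#include <cstdint>

// Scan a three-column table for a row mentioning `opcode` in any column;
// return nullptr when the opcode has no domain-replacement entry.
inline const uint16_t *findReplacementRow(const uint16_t (*table)[3],
                                          size_t numRows, uint16_t opcode) {
  for (size_t i = 0; i != numRows; ++i)
    for (unsigned col = 0; col != 3; ++col)
      if (table[i][col] == opcode)
        return table[i];
  return nullptr;
}

With a row in hand, the replacement for a given domain is simply the entry in that domain's column.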


        

