[llvm] 20013d0 - [X86][AMX] Fix tile config register spill issue.

via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 21 00:02:17 PST 2021


Author: Luo, Yuanke
Date: 2021-01-21T16:01:50+08:00
New Revision: 20013d02f3352a88d0838eed349abc9a2b0e9cc0

URL: https://github.com/llvm/llvm-project/commit/20013d02f3352a88d0838eed349abc9a2b0e9cc0
DIFF: https://github.com/llvm/llvm-project/commit/20013d02f3352a88d0838eed349abc9a2b0e9cc0.diff

LOG: [X86][AMX] Fix tile config register spill issue.

Previously, the code modeled the tile config register as a use of each
AMX instruction. That model has a problem when the tile config register
is spilled: across a function call, an ldtilecfg instruction may be
inserted before every AMX instruction that uses the tile config
register, which clobbers all tile data registers.
To fix this issue, we remove the tile config register from the model.
Instead, we analyze the regmask of each call instruction and insert
ldtilecfg if any tile data register is live across the call. Inserting
sttilecfg before the call is unnecessary, because the tile config
doesn't change and we can simply reload it.
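In code, the per-call test amounts to intersecting the call's regmask
with the tile register bank. Below is a condensed sketch of the idea;
CallMI, AfterCall and FI are placeholder names, and the full logic is
in X86PreTileConfig::reloadTileConfig in the diff:

    // A call's regmask has a bit set for each register the callee
    // preserves, so clearing those bits from a mask of all tile
    // registers leaves exactly the tiles the call clobbers.
    BitVector UsableRegs(TRI->getNumRegs());
    for (unsigned I = 0; I < TileRegNum; ++I)
      UsableRegs.set(X86::TMM0 + I);
    for (const MachineOperand &MO : CallMI->operands())
      if (MO.isRegMask())
        UsableRegs.clearBitsInMask(MO.getRegMask());
    if (!UsableRegs.none()) {
      // Some tile data register is clobbered: reload the config from
      // its stack slot right after the call; no sttilecfg is needed.
      addFrameReference(BuildMI(*MBB, AfterCall, DebugLoc(),
                                TII->get(X86::LDTILECFG)), FI);
    }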
Besides that, we also need to check tile config register interference.
Since we don't model the config register, we should check interference
from the ldtilecfg to each tile data register def.
             ldtilecfg
             /       \
            BB1      BB2
            /         \
           call       BB3
           /           \
       %1=tileload   %2=tilezero
We can start from each tile def instruction and walk backward to the
ldtilecfg. If we encounter a call instruction along the way and the
tile data register is not preserved by it, we should insert ldtilecfg
after that call instruction.
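In essence this is a reverse BFS over the CFG. A simplified sketch for
one tile def (Visited is a set of blocks; clobbersTileRegs and
insertLdtilecfgAfter are illustrative names for the regmask test and
the reload insertion above; the real walk also starts the first block
at the def instruction rather than at the block end):

    std::queue<MachineBasicBlock *> WorkList;
    WorkList.push(DefMI->getParent());
    while (!WorkList.empty()) {
      MachineBasicBlock *MBB = WorkList.front();
      WorkList.pop();
      // Skip blocks seen via a loop, a joined path or another tile def.
      if (!Visited.insert(MBB).second)
        continue;
      bool ReachedConfig = false;
      for (MachineInstr &MI : llvm::reverse(*MBB)) {
        if (&MI == TileConfigMI) {
          ReachedConfig = true; // reached the ldtilecfg: walk is done
          break;
        }
        if (MI.isCall() && clobbersTileRegs(MI))
          insertLdtilecfgAfter(MI, FI);
      }
      if (ReachedConfig)
        break;
      // Otherwise keep walking toward the ldtilecfg via predecessors.
      for (MachineBasicBlock *Pred : MBB->predecessors())
        WorkList.push(Pred);
    }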

Differential Revision: https://reviews.llvm.org/D94155

Added: 
    llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll

Modified: 
    llvm/include/llvm/CodeGen/LiveIntervals.h
    llvm/lib/CodeGen/LiveIntervals.cpp
    llvm/lib/Target/X86/X86ExpandPseudo.cpp
    llvm/lib/Target/X86/X86FrameLowering.cpp
    llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/lib/Target/X86/X86InstrAMX.td
    llvm/lib/Target/X86/X86InstrInfo.cpp
    llvm/lib/Target/X86/X86PreTileConfig.cpp
    llvm/lib/Target/X86/X86RegisterInfo.td
    llvm/lib/Target/X86/X86TileConfig.cpp
    llvm/test/CodeGen/X86/AMX/amx-across-func.ll
    llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
    llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
    llvm/test/CodeGen/X86/opt-pipeline.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
index fa08166791b0..8c6f94052295 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -377,6 +377,13 @@ class VirtRegMap;
     bool checkRegMaskInterference(LiveInterval &LI,
                                   BitVector &UsableRegs);
 
+    /// Get the interfering slot indexes and their regmasks for a live
+    /// interval. Return false if there is no interference.
+    bool
+    getInterferenceRegMasks(LiveInterval &LI,
+                            SmallVectorImpl<SlotIndex> &RegSlots,
+                            SmallVectorImpl<const uint32_t *> &RegMaskBits);
+
     // Register unit functions.
     //
     // Fixed interference occurs when MachineInstrs use physregs directly

diff  --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index a32b486240c8..17005b38ac94 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -952,6 +952,56 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI,
   }
 }
 
+bool LiveIntervals::getInterferenceRegMasks(
+    LiveInterval &LI, SmallVectorImpl<SlotIndex> &RegSlots,
+    SmallVectorImpl<const uint32_t *> &RegBits) {
+  if (LI.empty())
+    return false;
+  LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end();
+
+  // Use smaller arrays for local live ranges.
+  ArrayRef<SlotIndex> Slots;
+  ArrayRef<const uint32_t *> Bits;
+  if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) {
+    Slots = getRegMaskSlotsInBlock(MBB->getNumber());
+    Bits = getRegMaskBitsInBlock(MBB->getNumber());
+  } else {
+    Slots = getRegMaskSlots();
+    Bits = getRegMaskBits();
+  }
+
+  // We are going to enumerate all the register mask slots contained in LI.
+  // Start with a binary search of RegMaskSlots to find a starting point.
+  ArrayRef<SlotIndex>::iterator SlotI = llvm::lower_bound(Slots, LiveI->start);
+  ArrayRef<SlotIndex>::iterator SlotE = Slots.end();
+
+  // No slots in range, LI begins after the last call.
+  if (SlotI == SlotE)
+    return false;
+
+  bool Found = false;
+  while (true) {
+    assert(*SlotI >= LiveI->start);
+    // Loop over all slots overlapping this segment.
+    while (*SlotI < LiveI->end) {
+      // *SlotI overlaps LI. Collect mask bits.
+      Found = true;
+      RegSlots.push_back(*SlotI);
+      RegBits.push_back(Bits[SlotI - Slots.begin()]);
+      if (++SlotI == SlotE)
+        return Found;
+    }
+    // *SlotI is beyond the current LI segment.
+    LiveI = LI.advanceTo(LiveI, *SlotI);
+    if (LiveI == LiveE)
+      return Found;
+    // Advance SlotI until it overlaps.
+    while (*SlotI < LiveI->start)
+      if (++SlotI == SlotE)
+        return Found;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 //                         IntervalUpdate class.
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 15af0fb2e888..395f437bb648 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -461,25 +461,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
   case TargetOpcode::ICALL_BRANCH_FUNNEL:
     ExpandICallBranchFunnel(&MBB, MBBI);
     return true;
-  case X86::PLDTILECFG: {
-    MI.RemoveOperand(0);
-    MI.setDesc(TII->get(X86::LDTILECFG));
-    return true;
-  }
-  case X86::PSTTILECFG: {
-    MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
-    MI.setDesc(TII->get(X86::STTILECFG));
-    return true;
-  }
   case X86::PTILELOADDV: {
-    MI.RemoveOperand(8); // Remove $tmmcfg
     for (unsigned i = 2; i > 0; --i)
       MI.RemoveOperand(i);
     MI.setDesc(TII->get(X86::TILELOADD));
     return true;
   }
   case X86::PTDPBSSDV: {
-    MI.RemoveOperand(7); // Remove $tmmcfg
     MI.untieRegOperand(4);
     for (unsigned i = 3; i > 0; --i)
       MI.RemoveOperand(i);
@@ -488,14 +476,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     return true;
   }
   case X86::PTILESTOREDV: {
-    MI.RemoveOperand(8); // Remove $tmmcfg
     for (int i = 1; i >= 0; --i)
       MI.RemoveOperand(i);
     MI.setDesc(TII->get(X86::TILESTORED));
     return true;
   }
   case X86::PTILEZEROV: {
-    for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
+    for (int i = 2; i > 0; --i) // Remove row, col
       MI.RemoveOperand(i);
     MI.setDesc(TII->get(X86::TILEZERO));
     return true;

diff  --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 8339f512158d..fcddfb93b7a3 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2094,8 +2094,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
 
   // Emit tilerelease for AMX kernel.
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  if (!MRI.reg_nodbg_empty(X86::TMMCFG))
-    BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
+  const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
+  unsigned TileRegNum = RC->getNumRegs();
+  for (unsigned I = 0; I < TileRegNum; I++) {
+    if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) {
+      BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
+      break;
+    }
+  }
 }
 
 StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,

diff  --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index a96f73df855d..302a15701d81 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4606,7 +4606,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       SDValue Index = Node->getOperand(5);
       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
-      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
       SDValue Chain = Node->getOperand(0);
       MachineSDNode *CNode;
       SDValue Ops[] = {Node->getOperand(2),
@@ -4616,7 +4615,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
                        Index,
                        Disp,
                        Segment,
-                       CFG,
                        Chain};
       CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
       ReplaceNode(Node, CNode);
@@ -4627,14 +4625,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         break;
       SDValue Chain = Node->getOperand(0);
       unsigned Opc = X86::PTDPBSSDV;
-      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
       SDValue Ops[] = {Node->getOperand(2),
                        Node->getOperand(3),
                        Node->getOperand(4),
                        Node->getOperand(5),
                        Node->getOperand(6),
                        Node->getOperand(7),
-                       CFG,
                        Chain};
       MachineSDNode *CNode =
           CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
@@ -4646,8 +4642,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         break;
       unsigned Opc = X86::PTILEZEROV;
       SDValue Chain = Node->getOperand(0);
-      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
-      SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
+      SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain};
       MachineSDNode *CNode =
           CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
       ReplaceNode(Node, CNode);
@@ -4718,7 +4713,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       SDValue Index = Node->getOperand(5);
       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
-      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
       SDValue Chain = Node->getOperand(0);
       MachineSDNode *CNode;
       SDValue Ops[] = {Node->getOperand(2),
@@ -4729,7 +4723,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
                        Disp,
                        Segment,
                        Node->getOperand(6),
-                       CFG,
                        Chain};
       CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
       ReplaceNode(Node, CNode);

diff  --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index e4f3290cab9f..209ebd4b3de3 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -48,23 +48,14 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
                      VEX, T8XD;
 
     // Pseduo instruction for RA.
-    let hasSideEffects = 1, mayLoad = 1,
-        Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
-    def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
-
-    let hasSideEffects = 1, mayStore = 1 in
-    def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
-
     def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
                                                       GR16:$src2,
-                                                      opaquemem:$src3,
-                                                      TILECFG:$cfg), []>;
+                                                      opaquemem:$src3), []>;
     def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
                                             GR16:$src2, opaquemem:$src3,
-                                            TILE:$src4, TILECFG:$cfg), []>;
+                                            TILE:$src4), []>;
     def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
-                                                     GR16:$src2,
-                                                     TILECFG:$cfg), []>;
+                                                     GR16:$src2), []>;
 
     let usesCustomInserter = 1 in {
       // Pseudo instructions, using immediates instead of tile registers.
@@ -104,7 +95,7 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
     let Constraints = "$src4 = $dst" in
     def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
                             GR16:$src2, GR16:$src3, TILE:$src4,
-                            TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
+                            TILE:$src5, TILE:$src6), []>;
 
     let usesCustomInserter = 1 in {
       // Pseudo instructions, using immediates instead of tile registers.

diff  --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index d9bab14f0c08..fe434bd80f35 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3808,10 +3808,6 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     MachineOperand &MO = NewMI->getOperand(2);
     MO.setReg(VirtReg);
     MO.setIsKill(true);
-  } else if (RC->getID() == X86::TILECFGRegClassID) {
-    unsigned Opc = X86::PSTTILECFG;
-    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
-        .addReg(SrcReg, getKillRegState(isKill));
   } else {
     unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
     bool isAligned =
@@ -3840,10 +3836,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     MachineOperand &MO = NewMI->getOperand(3);
     MO.setReg(VirtReg);
     MO.setIsKill(true);
-  } else if (RC->getID() == X86::TILECFGRegClassID) {
-    unsigned Opc = X86::PLDTILECFG;
-    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
-                      FrameIdx);
   } else {
     const MachineFunction &MF = *MBB.getParent();
     unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);

diff  --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 05ee6c6c8384..a61f9c5cc752 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -38,6 +38,7 @@
 #include "X86InstrBuilder.h"
 #include "X86RegisterInfo.h"
 #include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -62,8 +63,13 @@ class X86PreTileConfig : public MachineFunctionPass {
   const TargetInstrInfo *TII;
   MachineDominatorTree *DomTree = nullptr;
   MachineRegisterInfo *MRI = nullptr;
+  LiveIntervals *LIS = nullptr;
+  SmallVector<Register, 16> VTileRegs;
+  MachineInstr *TileConfigMI = nullptr;
 
+  void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx);
   MachineInstr *getTileConfigPoint();
+  void reloadTileConfig(int FI);
 
 public:
   X86PreTileConfig() : MachineFunctionPass(ID) {}
@@ -88,20 +94,21 @@ char X86PreTileConfig::ID = 0;
 
 INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
                       "Tile Register Configure", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                     "Tile Register Configure", false, false)
 
 void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
+  AU.addRequired<LiveIntervals>();
+  AU.addPreserved<LiveIntervals>();
   AU.addRequired<MachineDominatorTree>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
-                              const TargetInstrInfo *TII,
-                              MachineRegisterInfo *MRI,
-                              const X86Subtarget *ST) {
+void X86PreTileConfig::buildConfigMI(MachineBasicBlock::iterator MI,
+                                     int FrameIdx) {
   auto *MBB = MI->getParent();
 
   // FIXME: AMX should assume AVX512 enabled.
@@ -111,18 +118,15 @@ static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
     BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
         .addReg(Zmm, RegState::Undef)
         .addReg(Zmm, RegState::Undef);
-    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
-                      FrameIdx)
-        .addReg(Zmm);
+    TileConfigMI = &*addFrameReference(BuildMI(*MBB, MI, DebugLoc(),
+                                               TII->get(X86::VMOVUPSZmr)),
+                                       FrameIdx)
+                         .addReg(Zmm);
   }
 
   // build psuedo ldtilecfg
-  Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
-
-  addFrameReference(
-      BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
-
-  return VReg;
+  addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)),
+                    FrameIdx);
 }
 
 static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
@@ -151,6 +155,7 @@ MachineInstr *X86PreTileConfig::getTileConfigPoint() {
     const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
     if (RC.getID() != X86::TILERegClassID)
       continue;
+    VTileRegs.push_back(VirtReg);
 
     // Find the common dominator for all MI that define tile register.
     for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
@@ -219,23 +224,138 @@ MachineInstr *X86PreTileConfig::getTileConfigPoint() {
   return &*MII;
 }
 
-static void addTileCFGUse(MachineFunction &MF, Register CFG) {
-  for (MachineBasicBlock &MBB : MF) {
+void X86PreTileConfig::reloadTileConfig(int FI) {
+  SmallSet<MachineInstr *, 8> MIVisited;
+  const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
+  auto TileRegNum = RC->getNumRegs();
+
+  for (Register VReg : VTileRegs) {
+    BitVector UsableRegs(TRI->getNumRegs());
+    for (unsigned I = 0; I < TileRegNum; I++)
+      UsableRegs.set(X86::TMM0 + I);
+    SmallVector<SlotIndex, 8> RegSlots;
+    SmallVector<const uint32_t *, 8> RegMasks;
+    LiveInterval &LI = LIS->getInterval(VReg);
+    if (!LIS->getInterferenceRegMasks(LI, RegSlots, RegMasks))
+      continue;
+    for (unsigned I = 0; I < RegSlots.size(); I++) {
+      SlotIndex &SI = RegSlots[I];
+      MachineInstr *MI = LIS->getInstructionFromIndex(SI);
+      // We have reloaded the tile config register before.
+      if (MIVisited.count(MI))
+        continue;
+      // For inline assembly, we don't reload the tile config register.
+      // If there is any ldtilecfg instruction in inline assembly, it is
+      // the user's responsibility to restore everything.
+      if (!MI->isCall())
+        continue;
+      UsableRegs.clearBitsInMask(RegMasks[I]);
+      MIVisited.insert(MI);
+      // There is no interference in the callee. This is a benefit of
+      // IPRA.
+      if (UsableRegs.none())
+        continue;
+
+      // Insert ldtilecfg to reload the config after the call.
+      auto *MBB = MI->getParent();
+      auto MII = MachineBasicBlock::iterator(MI);
+      MII++;
+      addFrameReference(
+          BuildMI(*MBB, *MII, DebugLoc(), TII->get(X86::LDTILECFG)), FI);
+    }
+  }
+  // We have just checked tile data register interference; we also need
+  // to check tile config register interference. Since we don't model
+  // the config register, we should check interference from the
+  // ldtilecfg to each tile data register def.
+  //              ldtilecfg
+  //              /       \
+  //             BB1      BB2
+  //             /         \
+  //            call       BB3
+  //            /           \
+  //        %1=tileload   %2=tilezero
+  // We can start from each tile def instruction and walk backward to
+  // the ldtilecfg. If any call on the way does not preserve the tile
+  // data register, we should insert ldtilecfg after that call.
+  SmallSet<MachineBasicBlock *, 8> MBBVisited;
+  for (Register VReg : VTileRegs) {
+    for (MachineOperand &MO : MRI->def_operands(VReg)) {
+      if (MO.isUndef())
+        continue;
+      MachineInstr *MI = MO.getParent();
+      // This may be a PHI instruction.
+      // There must be several tile defs before the PHI instruction.
+      if (MI->isTransient())
+        continue;
+
+      bool Terminate = false;
+      MachineBasicBlock *MBB = MI->getParent();
+      // Walk backward to see if there is any call after the ldtilecfg.
+      std::queue<MachineBasicBlock *> WorkList;
+      WorkList.push(MBB);
+      bool First = true;
+      while (!WorkList.empty()) {
+        MBB = WorkList.front();
+        WorkList.pop();
+        // If we have iterated over this basic block before, don't visit
+        // it or its predecessors again. This may happen due to a loop,
+        // a join from several successors, or because it was visited when
+        // handling another tile register. Below, BB1 hits the condition.
+        //               ldtilecfg
+        //                  |
+        //              ---BB1---
+        //              /        \
+        //            BB2        BB3
+        //            /           \
+        //        %1=tileload   %2=tilezero
+        if (MBBVisited.count(MBB))
+          continue;
+        // For the first MBB, we start from the AMX instruction that
+        // defines the tile register.
+        auto I = (First) ? MI->getReverseIterator() : MBB->instr_rbegin();
+        for (auto E = MBB->instr_rend(); I != E; ++I) {
+          // If this is the insertion point of the ldtilecfg, we have
+          // finished the backward walk.
+          if (&*I == TileConfigMI) {
+            Terminate = true;
+            break;
+          }
+          if (MIVisited.count(&*I))
+            continue;
+          if (!I->isCall())
+            continue;
+          BitVector UsableRegs(TRI->getNumRegs());
+          for (unsigned I = 0; I < TileRegNum; I++)
+            UsableRegs.set(X86::TMM0 + I);
+          for (MachineOperand &CallMO : I->operands()) {
+            if (CallMO.isRegMask())
+              UsableRegs.clearBitsInMask(CallMO.getRegMask());
+          }
+          // Record the call to avoid inserting ldtilecfg twice.
+          MIVisited.insert(&*I);
+          if (UsableRegs.none())
+            continue;
+          // Insert ldtilecfg after the call instruction.
+          --I;
+          addFrameReference(
+              BuildMI(*MBB, *I, DebugLoc(), TII->get(X86::LDTILECFG)), FI);
+        }
+        // We reached the ldtilecfg insertion point, so we don't need to
+        // walk backward any further.
+        if (Terminate)
+          break;
+        // Next we will iterate over its predecessors.
+        for (MachineBasicBlock::pred_iterator S = MBB->pred_begin(),
+                                              E = MBB->pred_end();
+             S != E; S++)
+          WorkList.push(*S);
 
-    // Traverse the basic block.
-    for (MachineInstr &MI : MBB) {
-      unsigned Opcode = MI.getOpcode();
-      switch (Opcode) {
-      default:
-        break;
-      case X86::PTILELOADDV:
-      case X86::PTILESTOREDV:
-      case X86::PTDPBSSDV:
-      case X86::PTILEZEROV:
-        unsigned NumOperands = MI.getNumOperands();
-        MI.RemoveOperand(NumOperands - 1);
-        MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
-        break;
+        // The first MBB may be visited a second time when it is in
+        // a loop.
+        if (!First)
+          MBBVisited.insert(MBB);
+        First = false;
       }
     }
   }
@@ -248,15 +368,17 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
   TRI = ST->getRegisterInfo();
   TII = mf.getSubtarget().getInstrInfo();
   DomTree = &getAnalysis<MachineDominatorTree>();
+  LIS = &getAnalysis<LiveIntervals>();
 
-  MachineInstr *MI = getTileConfigPoint();
-  if (!MI)
+  auto *TileConfigPoint = getTileConfigPoint();
+  if (!TileConfigPoint)
     return false;
   unsigned Size = ST->getTileConfigSize();
   Align Alignment = ST->getTileConfigAlignment();
   int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
-  Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
-  addTileCFGUse(mf, CFG);
+  buildConfigMI(TileConfigPoint, SS);
+  reloadTileConfig(SS);
+  VTileRegs.clear();
   return true;
 }
 

diff  --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 75cbd4e1cff1..c8723c8268f2 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -639,8 +639,3 @@ def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
 let CopyCost = -1 in // Don't allow copying of tile registers
 def TILE : RegisterClass<"X86", [x86amx], 8192,
                          (sequence "TMM%u", 0, 7)> {let Size = 8192;}
-def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
-  let CopyCost = -1;  // Don't allow copying of tile config registers.
-  let isAllocatable = 1;
-  let Size = 512;
-}

diff  --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
index ef010bcd38b7..d6c1dcaf0588 100644
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -22,6 +22,7 @@
 #include "X86MachineFunctionInfo.h"
 #include "X86RegisterInfo.h"
 #include "X86Subtarget.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -130,13 +131,14 @@ static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
 }
 
 MachineInstr *X86TileConfig::getTileConfigPoint() {
-  for (MachineBasicBlock &MBB : *MF) {
-
-    // Traverse the basic block.
-    for (MachineInstr &MI : MBB)
+  MachineBasicBlock *Entry = &*MF->begin();
+  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(Entry);
+  for (MachineBasicBlock *MBB : RPOT) {
+    for (MachineInstr &MI : *MBB)
       // Refer X86PreTileConfig.cpp.
-      // We only support one tile config for now.
-      if (MI.getOpcode() == X86::PLDTILECFG)
+      // We only support one tile config for now. The other ldtilecfg
+      // instructions are reloads and are dominated by the first one.
+      if (MI.getOpcode() == X86::LDTILECFG)
         return &MI;
   }
 
@@ -148,7 +150,7 @@ void X86TileConfig::tileConfig() {
   if (!MI)
     return;
   MachineBasicBlock *MBB = MI->getParent();
-  int SS = MI->getOperand(1).getIndex();
+  int SS = MI->getOperand(0).getIndex();
   BitVector PhysRegs(TRI->getNumRegs());
 
   // Fill in the palette first.

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
index a68a81b8d732..87973fd9c315 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -1,10 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
-
-%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
 
 @buf = dso_local global [3072 x i8] zeroinitializer, align 64
 
+define internal void @foo() #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    retq
+;
+; IPRA-LABEL: foo:
+; IPRA:       # %bb.0: # %entry
+; IPRA-NEXT:    pushq %rbp
+; IPRA-NEXT:    .cfi_def_cfa_offset 16
+; IPRA-NEXT:    .cfi_offset %rbp, -16
+; IPRA-NEXT:    movq %rsp, %rbp
+; IPRA-NEXT:    .cfi_def_cfa_register %rbp
+; IPRA-NEXT:    popq %rbp
+; IPRA-NEXT:    .cfi_def_cfa %rsp, 8
+; IPRA-NEXT:    retq
+entry:
+  ret void
+}
+
 define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; CHECK-LABEL: test_api:
 ; CHECK:       # %bb.0:
@@ -25,7 +50,6 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
 ; CHECK-NEXT:    movl $buf, %eax
 ; CHECK-NEXT:    movl $32, %r14d
 ; CHECK-NEXT:    movw $8, %r15w
@@ -36,11 +60,10 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm2
 ; CHECK-NEXT:    movabsq $64, %rax
 ; CHECK-NEXT:    tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl $buf+2048, %eax
-; CHECK-NEXT:    ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm0
 ; CHECK-NEXT:    movabsq $64, %rcx
 ; CHECK-NEXT:    tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
@@ -55,17 +78,48 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    tilerelease
 ; CHECK-NEXT:    retq
+;
+; IPRA-LABEL: test_api:
+; IPRA:       # %bb.0:
+; IPRA-NEXT:    pushq %rbp
+; IPRA-NEXT:    subq $64, %rsp
+; IPRA-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; IPRA-NEXT:    vmovdqu64 %zmm0, (%rsp)
+; IPRA-NEXT:    movb $1, (%rsp)
+; IPRA-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    movw %si, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    movw %si, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    ldtilecfg (%rsp)
+; IPRA-NEXT:    movl $buf, %eax
+; IPRA-NEXT:    movl $32, %ecx
+; IPRA-NEXT:    movw $8, %dx
+; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm0
+; IPRA-NEXT:    movl $buf+1024, %eax
+; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm1
+; IPRA-NEXT:    callq foo
+; IPRA-NEXT:    movl $buf+2048, %eax
+; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm2
+; IPRA-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
+; IPRA-NEXT:    tilestored %tmm2, (%rax,%rcx)
+; IPRA-NEXT:    addq $64, %rsp
+; IPRA-NEXT:    popq %rbp
+; IPRA-NEXT:    tilerelease
+; IPRA-NEXT:    vzeroupper
+; IPRA-NEXT:    retq
   %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
   %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
-  tail call void (...) @foo()
+  call void @foo()
   %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
   %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
   tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
   ret void
 }
 
-declare dso_local void @foo(...)
-
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
+
+attributes #0 = { noinline nounwind optnone uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
index a415d9c15242..f38554b9f79d 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
@@ -5,7 +5,6 @@ define void @test_amx() {
 ; CHECK-LABEL: test_amx:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    tdpbf16ps %tmm7, %tmm4, %tmm3
-; CHECK-NEXT:    retq
   call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
   ret void
 }

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
new file mode 100644
index 000000000000..b381429c9374
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+ at buf = dso_local global [3072 x i8] zeroinitializer, align 16
+
+define dso_local void @test1(i16 signext %0, i16 signext %1) local_unnamed_addr {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl $buf, %eax
+; CHECK-NEXT:    movl $32, %ecx
+; CHECK-NEXT:    movw $8, %dx
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm0
+; CHECK-NEXT:    movl $buf+1024, %eax
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm1
+; CHECK-NEXT:    movl $buf+2048, %eax
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
+; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
+; CHECK-NEXT:    tilestored %tmm2, (%rax,%rcx)
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    jmp foo # TAILCALL
+  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
+  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
+  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
+  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
+  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
+  tail call void @foo()
+  ret void
+}
+
+define dso_local void @test2(i16 signext %0, i16 signext %1) local_unnamed_addr {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    .cfi_offset %rbx, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq foo
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB1_3
+; CHECK-NEXT:  # %bb.1: # %if.true
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    tilezero %tmm0
+; CHECK-NEXT:    movl $32, %ecx
+; CHECK-NEXT:    movl $buf+1024, %edx
+; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm1
+; CHECK-NEXT:    movl $buf+2048, %edx
+; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm2
+; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rcx)
+; CHECK-NEXT:    jmp .LBB1_2
+; CHECK-NEXT:  .LBB1_3: # %if.false
+; CHECK-NEXT:    movl $buf, %eax
+; CHECK-NEXT:    movl $32, %ecx
+; CHECK-NEXT:    movw $8, %dx
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm3
+; CHECK-NEXT:    movl $buf+1024, %eax
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm4
+; CHECK-NEXT:    movl $buf+2048, %eax
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
+; CHECK-NEXT:    tdpbssd %tmm2, %tmm4, %tmm3
+; CHECK-NEXT:    tilestored %tmm3, (%rax,%rcx)
+; CHECK-NEXT:  .LBB1_2: # %if.true
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    retq
+  call void @foo()
+  br i1 undef, label %if.true, label %if.false
+
+if.true:
+  %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
+  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
+  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
+  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
+  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
+  br label %exit
+
+if.false:
+  %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
+  %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
+  %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
+  %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
+  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
+  br label %exit
+
+exit:
+  ret void
+}
+
+declare dso_local void @foo() local_unnamed_addr
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
index 0dc0c34c340c..57b67c456b36 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
@@ -36,11 +36,10 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; CHECK-NEXT:    tileloadd (%r15,%r14), %tmm5
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
-; CHECK-NEXT:    movl $buf, %eax
-; CHECK-NEXT:    movw $8, %cx
 ; CHECK-NEXT:    jne .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.true
+; CHECK-NEXT:    movl $buf, %eax
+; CHECK-NEXT:    movw $8, %cx
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm0
 ; CHECK-NEXT:    movl $buf+1024, %eax
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm1
@@ -52,11 +51,13 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
-; CHECK-NEXT:    ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movabsq $64, %rax
 ; CHECK-NEXT:    tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
 ; CHECK-NEXT:    jmp .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: # %if.false
+; CHECK-NEXT:    movl $buf, %eax
+; CHECK-NEXT:    movw $8, %cx
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm2
 ; CHECK-NEXT:    movl $buf+1024, %eax
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm3
@@ -68,7 +69,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
-; CHECK-NEXT:    ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movabsq $64, %rax
 ; CHECK-NEXT:    tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
 ; CHECK-NEXT:    tilestored %tmm6, (%r15,%r14)
@@ -139,7 +140,6 @@ define dso_local void @test3(i8 *%buf) nounwind {
 ; CHECK-NEXT:    movq %rdi, %rbx
 ; CHECK-NEXT:    movl $32, %r14d
 ; CHECK-NEXT:    xorl %ebp, %ebp
-; CHECK-NEXT:    sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB1_2: # %loop.header
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -149,7 +149,7 @@ define dso_local void @test3(i8 *%buf) nounwind {
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
-; CHECK-NEXT:    ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    tilezero %tmm0
 ; CHECK-NEXT:    tileloadd (%rbx,%r14), %tmm1
 ; CHECK-NEXT:    tileloadd (%rbx,%r14), %tmm2

diff  --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index b851eea60b0a..1e1154b5f759 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -120,6 +120,8 @@
 ; CHECK-NEXT:       X86 EFLAGS copy lowering
 ; CHECK-NEXT:       X86 WinAlloca Expander
 ; CHECK-NEXT:       MachineDominator Tree Construction
+; CHECK-NEXT:       Slot index numbering
+; CHECK-NEXT:       Live Interval Analysis
 ; CHECK-NEXT:       Tile Register Pre-configure
 ; CHECK-NEXT:       Detect Dead Lanes
 ; CHECK-NEXT:       Process Implicit Definitions


        

