[llvm-branch-commits] [llvm] 64132f5 - Revert "[X86][AMX] Fix tile config register spill issue."
via llvm-branch-commits
llvm-branch-commits@lists.llvm.org
Thu Jan 21 02:17:09 PST 2021
Author: Luo, Yuanke
Date: 2021-01-21T18:11:43+08:00
New Revision: 64132f541edd82bffebbd5521e620219743a42eb
URL: https://github.com/llvm/llvm-project/commit/64132f541edd82bffebbd5521e620219743a42eb
DIFF: https://github.com/llvm/llvm-project/commit/64132f541edd82bffebbd5521e620219743a42eb.diff
LOG: Revert "[X86][AMX] Fix tile config register spill issue."
This reverts commit 20013d02f3352a88d0838eed349abc9a2b0e9cc0.
Added:
Modified:
llvm/include/llvm/CodeGen/LiveIntervals.h
llvm/lib/CodeGen/LiveIntervals.cpp
llvm/lib/Target/X86/X86ExpandPseudo.cpp
llvm/lib/Target/X86/X86FrameLowering.cpp
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
llvm/lib/Target/X86/X86InstrAMX.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86PreTileConfig.cpp
llvm/lib/Target/X86/X86RegisterInfo.td
llvm/lib/Target/X86/X86TileConfig.cpp
llvm/test/CodeGen/X86/AMX/amx-across-func.ll
llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
llvm/test/CodeGen/X86/opt-pipeline.ll
Removed:
llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
################################################################################
diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
index 8c6f94052295..fa08166791b0 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -377,13 +377,6 @@ class VirtRegMap;
bool checkRegMaskInterference(LiveInterval &LI,
BitVector &UsableRegs);
- /// Get the interfering slot indexes and their regmasks for a live interval.
- /// Return false if there is no interference.
- bool
- getInterferenceRegMasks(LiveInterval &LI,
- SmallVectorImpl<SlotIndex> &RegSlots,
- SmallVectorImpl<const uint32_t *> &RegMaskBits);
-
// Register unit functions.
//
// Fixed interference occurs when MachineInstrs use physregs directly
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 17005b38ac94..a32b486240c8 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -952,56 +952,6 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI,
}
}
-bool LiveIntervals::getInterferenceRegMasks(
- LiveInterval &LI, SmallVectorImpl<SlotIndex> &RegSlots,
- SmallVectorImpl<const uint32_t *> &RegBits) {
- if (LI.empty())
- return false;
- LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end();
-
- // Use smaller arrays for local live ranges.
- ArrayRef<SlotIndex> Slots;
- ArrayRef<const uint32_t *> Bits;
- if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) {
- Slots = getRegMaskSlotsInBlock(MBB->getNumber());
- Bits = getRegMaskBitsInBlock(MBB->getNumber());
- } else {
- Slots = getRegMaskSlots();
- Bits = getRegMaskBits();
- }
-
- // We are going to enumerate all the register mask slots contained in LI.
- // Start with a binary search of RegMaskSlots to find a starting point.
- ArrayRef<SlotIndex>::iterator SlotI = llvm::lower_bound(Slots, LiveI->start);
- ArrayRef<SlotIndex>::iterator SlotE = Slots.end();
-
- // No slots in range, LI begins after the last call.
- if (SlotI == SlotE)
- return false;
-
- bool Found = false;
- while (true) {
- assert(*SlotI >= LiveI->start);
- // Loop over all slots overlapping this segment.
- while (*SlotI < LiveI->end) {
- // *SlotI overlaps LI. Collect mask bits.
- Found = true;
- RegSlots.push_back(*SlotI);
- RegBits.push_back(Bits[SlotI - Slots.begin()]);
- if (++SlotI == SlotE)
- return Found;
- }
- // *SlotI is beyond the current LI segment.
- LiveI = LI.advanceTo(LiveI, *SlotI);
- if (LiveI == LiveE)
- return Found;
- // Advance SlotI until it overlaps.
- while (*SlotI < LiveI->start)
- if (++SlotI == SlotE)
- return Found;
- }
-}
-
//===----------------------------------------------------------------------===//
// IntervalUpdate class.
//===----------------------------------------------------------------------===//
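The removed getInterferenceRegMasks walks two sorted sequences in lockstep: the segments of the live interval and the function's regmask slots. As a minimal standalone sketch of that two-cursor walk (plain integer positions stand in for SlotIndex, and collectOverlaps is a hypothetical name, not an LLVM API):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// A live range as sorted, disjoint [start, end) segments; regmask "slots"
// as sorted positions, with Bits[i] holding the mask recorded at Slots[i].
using Segment = std::pair<unsigned, unsigned>;

static bool collectOverlaps(const std::vector<Segment> &Live,
                            const std::vector<unsigned> &Slots,
                            const std::vector<const uint32_t *> &Bits,
                            std::vector<unsigned> &OutSlots,
                            std::vector<const uint32_t *> &OutBits) {
  if (Live.empty())
    return false;
  auto LiveI = Live.begin(), LiveE = Live.end();
  // Binary search for the first slot at or after the first segment's start.
  auto SlotI = std::lower_bound(Slots.begin(), Slots.end(), LiveI->first);
  bool Found = false;
  while (SlotI != Slots.end()) {
    if (*SlotI < LiveI->second) {
      // The slot lies inside the current segment: record it and its mask.
      Found = true;
      OutSlots.push_back(*SlotI);
      OutBits.push_back(Bits[SlotI - Slots.begin()]);
      ++SlotI;
    } else {
      // Advance to the first segment that ends after this slot...
      while (LiveI != LiveE && LiveI->second <= *SlotI)
        ++LiveI;
      if (LiveI == LiveE)
        break;
      // ...then skip slots that fall in the gap before that segment starts.
      while (SlotI != Slots.end() && *SlotI < LiveI->first)
        ++SlotI;
    }
  }
  return Found;
}

Both cursors only move forward, so after the initial binary search the walk is linear in the number of slots plus segments, the same complexity argument the real implementation relies on.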
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 395f437bb648..15af0fb2e888 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -461,13 +461,25 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
+ case X86::PLDTILECFG: {
+ MI.RemoveOperand(0);
+ MI.setDesc(TII->get(X86::LDTILECFG));
+ return true;
+ }
+ case X86::PSTTILECFG: {
+ MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
+ MI.setDesc(TII->get(X86::STTILECFG));
+ return true;
+ }
case X86::PTILELOADDV: {
+ MI.RemoveOperand(8); // Remove $tmmcfg
for (unsigned i = 2; i > 0; --i)
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILELOADD));
return true;
}
case X86::PTDPBSSDV: {
+ MI.RemoveOperand(7); // Remove $tmmcfg
MI.untieRegOperand(4);
for (unsigned i = 3; i > 0; --i)
MI.RemoveOperand(i);
@@ -476,13 +488,14 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
return true;
}
case X86::PTILESTOREDV: {
+ MI.RemoveOperand(8); // Remove $tmmcfg
for (int i = 1; i >= 0; --i)
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILESTORED));
return true;
}
case X86::PTILEZEROV: {
- for (int i = 2; i > 0; --i) // Remove row, col
+ for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILEZERO));
return true;
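Every case above follows one pattern: delete the operands that the real instruction does not take, from the highest index downward, then swap in the real descriptor. Descending order matters because RemoveOperand shifts later operands left, so it keeps the not-yet-removed indices valid. A hypothetical helper distilling the pattern, written against the MachineInstr API of this era (stripAndRetarget is an illustrative name, not a function in the patch):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

// Erase the contiguous operand range [Lo, Hi] of MI, highest index first,
// then retarget MI to the real (non-pseudo) opcode NewOpc.
static void stripAndRetarget(llvm::MachineInstr &MI,
                             const llvm::TargetInstrInfo &TII,
                             unsigned Lo, unsigned Hi, unsigned NewOpc) {
  for (unsigned I = Hi + 1; I > Lo; --I)
    MI.RemoveOperand(I - 1);
  MI.setDesc(TII.get(NewOpc));
}

With it, the PTILEZEROV case reads as stripAndRetarget(MI, *TII, 1, 3, X86::TILEZERO): operands 3..1 (row, col, $tmmcfg) are erased, and operand 0, the tile def, survives.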
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index fcddfb93b7a3..8339f512158d 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2094,14 +2094,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// Emit tilerelease for AMX kernel.
const MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
- unsigned TileRegNum = RC->getNumRegs();
- for (unsigned I = 0; I < TileRegNum; I++) {
- if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) {
- BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
- break;
- }
- }
+ if (!MRI.reg_nodbg_empty(X86::TMMCFG))
+ BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
}
StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
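The simplified check works because every AMX pseudo now carries a $tmmcfg operand, so any AMX activity in the function surfaces as a use of the single physical TMMCFG register. One register query can therefore replace the former scan over TMM0..TMM7; restated from the '+' lines above with the reasoning spelled out:

const MachineRegisterInfo &MRI = MF.getRegInfo();
// reg_nodbg_empty(Reg) is true iff no non-debug instruction defines or uses
// Reg, so this fires exactly for functions that touched AMX state.
if (!MRI.reg_nodbg_empty(X86::TMMCFG))
  BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));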
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 302a15701d81..a96f73df855d 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4606,6 +4606,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Index = Node->getOperand(5);
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Chain = Node->getOperand(0);
MachineSDNode *CNode;
SDValue Ops[] = {Node->getOperand(2),
@@ -4615,6 +4616,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
Index,
Disp,
Segment,
+ CFG,
Chain};
CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
ReplaceNode(Node, CNode);
@@ -4625,12 +4627,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
SDValue Chain = Node->getOperand(0);
unsigned Opc = X86::PTDPBSSDV;
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Ops[] = {Node->getOperand(2),
Node->getOperand(3),
Node->getOperand(4),
Node->getOperand(5),
Node->getOperand(6),
Node->getOperand(7),
+ CFG,
Chain};
MachineSDNode *CNode =
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
@@ -4642,7 +4646,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
unsigned Opc = X86::PTILEZEROV;
SDValue Chain = Node->getOperand(0);
- SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain};
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
MachineSDNode *CNode =
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
ReplaceNode(Node, CNode);
@@ -4713,6 +4718,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Index = Node->getOperand(5);
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Chain = Node->getOperand(0);
MachineSDNode *CNode;
SDValue Ops[] = {Node->getOperand(2),
@@ -4723,6 +4729,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
Disp,
Segment,
Node->getOperand(6),
+ CFG,
Chain};
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
ReplaceNode(Node, CNode);
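All four hunks in this file do the same thing: CurDAG->getRegister(0, MVT::Untyped) produces the "no register" operand ($noreg), which merely reserves an operand slot on the pseudo. X86PreTileConfig's addTileCFGUse, later in this patch, rewrites that slot to the virtual TILECFG register once the config point is known. Sketching the handoff between the two sides (MI, MF, and CFG as in addTileCFGUse; a fragment for illustration, not standalone code):

// ISel side: reserve the slot with register number 0, i.e. $noreg.
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);

// X86PreTileConfig side: pop the $noreg placeholder, which ISel made the
// last explicit operand, and append the real virtual config register.
unsigned NumOperands = MI.getNumOperands();
MI.RemoveOperand(NumOperands - 1);
MI.addOperand(MF, MachineOperand::CreateReg(CFG, /*isDef=*/false));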
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 209ebd4b3de3..e4f3290cab9f 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -48,14 +48,23 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
VEX, T8XD;
// Pseudo instruction for RA.
+ let hasSideEffects = 1, mayLoad = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
+
+ let hasSideEffects = 1, mayStore = 1 in
+ def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
+
def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
GR16:$src2,
- opaquemem:$src3), []>;
+ opaquemem:$src3,
+ TILECFG:$cfg), []>;
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
GR16:$src2, opaquemem:$src3,
- TILE:$src4), []>;
+ TILE:$src4, TILECFG:$cfg), []>;
def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
- GR16:$src2), []>;
+ GR16:$src2,
+ TILECFG:$cfg), []>;
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
@@ -95,7 +104,7 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
let Constraints = "$src4 = $dst" in
def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
GR16:$src2, GR16:$src3, TILE:$src4,
- TILE:$src5, TILE:$src6), []>;
+ TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index fe434bd80f35..d9bab14f0c08 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3808,6 +3808,10 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineOperand &MO = NewMI->getOperand(2);
MO.setReg(VirtReg);
MO.setIsKill(true);
+ } else if (RC->getID() == X86::TILECFGRegClassID) {
+ unsigned Opc = X86::PSTTILECFG;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
} else {
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
@@ -3836,6 +3840,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineOperand &MO = NewMI->getOperand(3);
MO.setReg(VirtReg);
MO.setIsKill(true);
+ } else if (RC->getID() == X86::TILECFGRegClassID) {
+ unsigned Opc = X86::PLDTILECFG;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
} else {
const MachineFunction &MF = *MBB.getParent();
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
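For context, these two hooks are what the spiller invokes whenever the allocator decides a value must live on the stack, and the register-class parameter is what routes TILECFG to the new pseudos. A simplified, hypothetical driver (not the real InlineSpiller code) showing the call shape, with signatures matching the hooks of this era:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

// For RC == &X86::TILECFGRegClass the hooks emit PSTTILECFG / PLDTILECFG,
// which X86ExpandPseudo later lowers to sttilecfg / ldtilecfg.
void spillAndReload(llvm::MachineBasicBlock &MBB,
                    llvm::MachineBasicBlock::iterator MI, llvm::Register Reg,
                    int FI, const llvm::TargetRegisterClass *RC,
                    const llvm::TargetInstrInfo &TII,
                    const llvm::TargetRegisterInfo &TRI) {
  TII.storeRegToStackSlot(MBB, MI, Reg, /*isKill=*/true, FI, RC, &TRI);
  // ... later, at the reload point:
  TII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &TRI);
}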
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index a61f9c5cc752..05ee6c6c8384 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -38,7 +38,6 @@
#include "X86InstrBuilder.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
-#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -63,13 +62,8 @@ class X86PreTileConfig : public MachineFunctionPass {
const TargetInstrInfo *TII;
MachineDominatorTree *DomTree = nullptr;
MachineRegisterInfo *MRI = nullptr;
- LiveIntervals *LIS = nullptr;
- SmallVector<Register, 16> VTileRegs;
- MachineInstr *TileConfigMI = nullptr;
- void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx);
MachineInstr *getTileConfigPoint();
- void reloadTileConfig(int FI);
public:
X86PreTileConfig() : MachineFunctionPass(ID) {}
@@ -94,21 +88,20 @@ char X86PreTileConfig::ID = 0;
INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
"Tile Register Configure", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
"Tile Register Configure", false, false)
void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<LiveIntervals>();
- AU.addPreserved<LiveIntervals>();
AU.addRequired<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
-void X86PreTileConfig::buildConfigMI(MachineBasicBlock::iterator MI,
- int FrameIdx) {
+static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
+ const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI,
+ const X86Subtarget *ST) {
auto *MBB = MI->getParent();
// FIXME: AMX should assume AVX512 is enabled.
@@ -118,15 +111,18 @@ void X86PreTileConfig::buildConfigMI(MachineBasicBlock::iterator MI,
BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
.addReg(Zmm, RegState::Undef)
.addReg(Zmm, RegState::Undef);
- TileConfigMI = &*addFrameReference(BuildMI(*MBB, MI, DebugLoc(),
- TII->get(X86::VMOVUPSZmr)),
- FrameIdx)
- .addReg(Zmm);
+ addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
+ FrameIdx)
+ .addReg(Zmm);
}
// build pseudo ldtilecfg
- addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)),
- FrameIdx);
+ Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
+
+ addFrameReference(
+ BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
+
+ return VReg;
}
static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
@@ -155,7 +151,6 @@ MachineInstr *X86PreTileConfig::getTileConfigPoint() {
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
if (RC.getID() != X86::TILERegClassID)
continue;
- VTileRegs.push_back(VirtReg);
// Find the common dominator for all MI that define tile register.
for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
@@ -224,138 +219,23 @@ MachineInstr *X86PreTileConfig::getTileConfigPoint() {
return &*MII;
}
-void X86PreTileConfig::reloadTileConfig(int FI) {
- SmallSet<MachineInstr *, 8> MIVisited;
- const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
- auto TileRegNum = RC->getNumRegs();
-
- for (Register VReg : VTileRegs) {
- BitVector UsableRegs(TRI->getNumRegs());
- for (unsigned I = 0; I < TileRegNum; I++)
- UsableRegs.set(X86::TMM0 + I);
- SmallVector<SlotIndex, 8> RegSlots;
- SmallVector<const uint32_t *, 8> RegMasks;
- LiveInterval &LI = LIS->getInterval(VReg);
- if (!LIS->getInterferenceRegMasks(LI, RegSlots, RegMasks))
- continue;
- for (unsigned I = 0; I < RegSlots.size(); I++) {
- SlotIndex &SI = RegSlots[I];
- MachineInstr *MI = LIS->getInstructionFromIndex(SI);
- // We have reloaded the tile config register before.
- if (MIVisited.count(MI))
- continue;
- // For inline assembly, we don't reload the tile config register.
- // If there is any ldtilecfg instruction in inline assembly,
- // it is the user's responsibility to restore everything.
- if (!MI->isCall())
- continue;
- UsableRegs.clearBitsInMask(RegMasks[I]);
- MIVisited.insert(MI);
- // There is no interference in the callee. This is a benefit of
- // IPRA.
- if (UsableRegs.none())
- continue;
-
- // build pseudo ldtilecfg
- auto *MBB = MI->getParent();
- auto MII = MachineBasicBlock::iterator(MI);
- MII++;
- addFrameReference(
- BuildMI(*MBB, *MII, DebugLoc(), TII->get(X86::LDTILECFG)), FI);
- }
- }
- // We just checked tile data register interference; we also need to check
- // tile config register interference. Since we don't model the config
- // register, we should check interference from the ldtilecfg to each tile
- // data register def.
- //        ldtilecfg
- //         /     \
- //       BB1     BB2
- //       /         \
- //     call        BB3
- //     /   \
- //  %1=tileload  %2=tilezero
- // We can start from the instruction of each tile def and walk backward to
- // the ldtilecfg. If there is any call instruction, and the tile data
- // register is not preserved, we should insert ldtilecfg after the call
- // instruction.
- SmallSet<MachineBasicBlock *, 8> MBBVisited;
- for (Register VReg : VTileRegs) {
- for (MachineOperand &MO : MRI->def_operands(VReg)) {
- if (MO.isUndef())
- continue;
- MachineInstr *MI = MO.getParent();
- // It may be a PHI instruction.
- // There must be several tile defs before a PHI instruction.
- if (MI->isTransient())
- continue;
-
- bool Terminate = false;
- MachineBasicBlock *MBB = MI->getParent();
- // Walk backward to see if there is any call instruction after ldtilecfg.
- std::queue<MachineBasicBlock *> WorkList;
- WorkList.push(MBB);
- bool First = true;
- while (!WorkList.empty()) {
- MBB = WorkList.front();
- WorkList.pop();
- // If we have iterated over the basic block before, don't iterate it and
- // its predecessors again. This may be caused by a loop, by a cross path
- // from several successors, or because it was already iterated when
- // handling another tile register. In the example below, BB1 hits the
- // condition.
- //      ldtilecfg
- //          |
- //      ---BB1---
- //      /       \
- //    BB2       BB3
- //    /           \
- // %1=tileload  %2=tilezero
- if (MBBVisited.count(MBB))
- continue;
- // For the first MBB, we start from the AMX instruction that defines
- // the tile register.
- auto I = (First) ? MI->getReverseIterator() : MBB->instr_rbegin();
- for (auto E = MBB->instr_rend(); I != E; ++I) {
- // If it is the insertion point for ldtilecfg, then we've finished
- // the backward walk.
- if (&*I == TileConfigMI) {
- Terminate = true;
- break;
- }
- if (MIVisited.count(&*I))
- continue;
- if (!I->isCall())
- continue;
- BitVector UsableRegs(TRI->getNumRegs());
- for (unsigned I = 0; I < TileRegNum; I++)
- UsableRegs.set(X86::TMM0 + I);
- for (MachineOperand &CallMO : I->operands()) {
- if (CallMO.isRegMask())
- UsableRegs.clearBitsInMask(CallMO.getRegMask());
- }
- // Record the call to avoid inserting ldtilecfg twice.
- MIVisited.insert(&*I);
- if (UsableRegs.none())
- continue;
- // Insert ldtilecfg after call instruction.
- --I;
- addFrameReference(
- BuildMI(*MBB, *I, DebugLoc(), TII->get(X86::LDTILECFG)), FI);
- }
- // We encountered a visited MachineInstr, so we don't need to walk
- // backward again.
- if (Terminate)
- break;
- // Next we will iterate over its predecessors.
- for (MachineBasicBlock::pred_iterator S = MBB->pred_begin(),
- E = MBB->pred_end();
- S != E; S++)
- WorkList.push(*S);
+static void addTileCFGUse(MachineFunction &MF, Register CFG) {
+ for (MachineBasicBlock &MBB : MF) {
- // The first MBB may be visited a second time when it is
- // in a loop.
- if (!First)
- MBBVisited.insert(MBB);
- First = false;
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ break;
+ case X86::PTILELOADDV:
+ case X86::PTILESTOREDV:
+ case X86::PTDPBSSDV:
+ case X86::PTILEZEROV:
+ unsigned NumOperands = MI.getNumOperands();
+ MI.RemoveOperand(NumOperands - 1);
+ MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
+ break;
}
}
}
@@ -368,17 +248,15 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
TRI = ST->getRegisterInfo();
TII = mf.getSubtarget().getInstrInfo();
DomTree = &getAnalysis<MachineDominatorTree>();
- LIS = &getAnalysis<LiveIntervals>();
- auto *TileConfigPoint = getTileConfigPoint();
- if (!TileConfigPoint)
+ MachineInstr *MI = getTileConfigPoint();
+ if (!MI)
return false;
unsigned Size = ST->getTileConfigSize();
Align Alignment = ST->getTileConfigAlignment();
int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
- buildConfigMI(TileConfigPoint, SS);
- reloadTileConfig(SS);
- VTileRegs.clear();
+ Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
+ addTileCFGUse(mf, CFG);
return true;
}
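After the revert the pass body has a clean three-step shape; an annotated outline of runOnMachineFunction above, with names taken from the patch and the surrounding boilerplate elided:

// 1. Find a point dominating all tile-register defs.
MachineInstr *MI = getTileConfigPoint();
if (!MI)
  return false; // no tile registers in this function, nothing to configure

// 2. Carve out a stack object for the tile configuration (zero-filled via
//    the zmm store in buildConfigMI) and emit PLDTILECFG from it, defining
//    a virtual TILECFG register.
int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);

// 3. Thread that vreg into every AMX pseudo, replacing the $noreg
//    placeholder that ISel left as the last operand.
addTileCFGUse(mf, CFG);
return true;

From here the register allocator sees an ordinary single-register live range for the config, so spills across calls fall out of the generic spill code plus the TILECFG special cases in X86InstrInfo.cpp, instead of the hand-rolled reloadTileConfig analysis deleted above.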
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index c8723c8268f2..75cbd4e1cff1 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -639,3 +639,8 @@ def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
let CopyCost = -1 in // Don't allow copying of tile registers
def TILE : RegisterClass<"X86", [x86amx], 8192,
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
+def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
+ let CopyCost = -1; // Don't allow copying of tile config registers.
+ let isAllocatable = 1;
+ let Size = 512;
+}
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
index d6c1dcaf0588..ef010bcd38b7 100644
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -22,7 +22,6 @@
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
-#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -131,14 +130,13 @@ static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
}
MachineInstr *X86TileConfig::getTileConfigPoint() {
- MachineBasicBlock *Entry = &*MF->begin();
- ReversePostOrderTraversal<MachineBasicBlock *> RPOT(Entry);
- for (MachineBasicBlock *MBB : RPOT) {
- for (MachineInstr &MI : *MBB)
+ for (MachineBasicBlock &MBB : *MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB)
// Refer to X86PreTileConfig.cpp.
- // We only support one tile config for now. The other ldtilecfg
- // is for spill purposes and is dominated by the first ldtilecfg.
- if (MI.getOpcode() == X86::LDTILECFG)
+ // We only support one tile config for now.
+ if (MI.getOpcode() == X86::PLDTILECFG)
return &MI;
}
@@ -150,7 +148,7 @@ void X86TileConfig::tileConfig() {
if (!MI)
return;
MachineBasicBlock *MBB = MI->getParent();
- int SS = MI->getOperand(0).getIndex();
+ int SS = MI->getOperand(1).getIndex();
BitVector PhysRegs(TRI->getNumRegs());
// Fill in the palette first.
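The operand-index change above (getOperand(0) to getOperand(1)) follows from the shape of the pseudo: the raw LDTILECFG has no definitions, so the frame index is its operand 0, while PLDTILECFG defines $cfg first and the frame index shifts to operand 1. In MIR-style notation (slot number SS hypothetical):

// Before:  LDTILECFG %stack.SS, 1, $noreg, 0, $noreg
//          frame index at operand 0
// After:   %cfg:tilecfg = PLDTILECFG %stack.SS, 1, $noreg, 0, $noreg
//          operand 0 is the $cfg def; frame index moves to operand 1
int SS = MI->getOperand(1).getIndex();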
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
index 87973fd9c315..a68a81b8d732 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -1,34 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
-@buf = dso_local global [3072 x i8] zeroinitializer, align 64
+%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
-define internal void @foo() #0 {
-; CHECK-LABEL: foo:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: .cfi_def_cfa %rsp, 8
-; CHECK-NEXT: retq
-;
-; IPRA-LABEL: foo:
-; IPRA: # %bb.0: # %entry
-; IPRA-NEXT: pushq %rbp
-; IPRA-NEXT: .cfi_def_cfa_offset 16
-; IPRA-NEXT: .cfi_offset %rbp, -16
-; IPRA-NEXT: movq %rsp, %rbp
-; IPRA-NEXT: .cfi_def_cfa_register %rbp
-; IPRA-NEXT: popq %rbp
-; IPRA-NEXT: .cfi_def_cfa %rsp, 8
-; IPRA-NEXT: retq
-entry:
- ret void
-}
+@buf = dso_local global [3072 x i8] zeroinitializer, align 64
define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test_api:
@@ -50,6 +25,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %r14d
; CHECK-NEXT: movw $8, %r15w
@@ -60,10 +36,11 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $buf+2048, %eax
+; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
@@ -78,48 +55,17 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: tilerelease
; CHECK-NEXT: retq
-;
-; IPRA-LABEL: test_api:
-; IPRA: # %bb.0:
-; IPRA-NEXT: pushq %rbp
-; IPRA-NEXT: subq $64, %rsp
-; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; IPRA-NEXT: vmovdqu64 %zmm0, (%rsp)
-; IPRA-NEXT: movb $1, (%rsp)
-; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; IPRA-NEXT: ldtilecfg (%rsp)
-; IPRA-NEXT: movl $buf, %eax
-; IPRA-NEXT: movl $32, %ecx
-; IPRA-NEXT: movw $8, %dx
-; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0
-; IPRA-NEXT: movl $buf+1024, %eax
-; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1
-; IPRA-NEXT: callq foo
-; IPRA-NEXT: movl $buf+2048, %eax
-; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm2
-; IPRA-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
-; IPRA-NEXT: tilestored %tmm2, (%rax,%rcx)
-; IPRA-NEXT: addq $64, %rsp
-; IPRA-NEXT: popq %rbp
-; IPRA-NEXT: tilerelease
-; IPRA-NEXT: vzeroupper
-; IPRA-NEXT: retq
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
- call void @foo()
+ tail call void (...) @foo()
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
ret void
}
+declare dso_local void @foo(...)
+
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
-
-attributes #0 = { noinline nounwind optnone uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
index f38554b9f79d..a415d9c15242 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
@@ -5,6 +5,7 @@ define void @test_amx() {
; CHECK-LABEL: test_amx:
; CHECK: # %bb.0:
; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3
+; CHECK-NEXT: retq
call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
ret void
}
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
deleted file mode 100644
index b381429c9374..000000000000
--- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
-@buf = dso_local global [3072 x i8] zeroinitializer, align 16
-
-define dso_local void @test1(i16 signext %0, i16 signext %1) local_unnamed_addr {
-; CHECK-LABEL: test1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movl $buf, %eax
-; CHECK-NEXT: movl $32, %ecx
-; CHECK-NEXT: movw $8, %dx
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0
-; CHECK-NEXT: movl $buf+1024, %eax
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1
-; CHECK-NEXT: movl $buf+2048, %eax
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
-; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: tilestored %tmm2, (%rax,%rcx)
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: jmp foo # TAILCALL
- %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
- %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
- %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
- %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
- tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
- tail call void @foo()
- ret void
-}
-
-define dso_local void @test2(i16 signext %0, i16 signext %1) local_unnamed_addr {
-; CHECK-LABEL: test2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: subq $72, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 96
-; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
-; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB1_3
-; CHECK-NEXT: # %bb.1: # %if.true
-; CHECK-NEXT: movw $8, %ax
-; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: movl $32, %ecx
-; CHECK-NEXT: movl $buf+1024, %edx
-; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm1
-; CHECK-NEXT: movl $buf+2048, %edx
-; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm2
-; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; CHECK-NEXT: tilestored %tmm0, (%rdx,%rcx)
-; CHECK-NEXT: jmp .LBB1_2
-; CHECK-NEXT: .LBB1_3: # %if.false
-; CHECK-NEXT: movl $buf, %eax
-; CHECK-NEXT: movl $32, %ecx
-; CHECK-NEXT: movw $8, %dx
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm3
-; CHECK-NEXT: movl $buf+1024, %eax
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4
-; CHECK-NEXT: movl $buf+2048, %eax
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
-; CHECK-NEXT: tdpbssd %tmm2, %tmm4, %tmm3
-; CHECK-NEXT: tilestored %tmm3, (%rax,%rcx)
-; CHECK-NEXT: .LBB1_2: # %if.true
-; CHECK-NEXT: addq $72, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: retq
- call void @foo()
- br i1 undef, label %if.true, label %if.false
-
-if.true:
- %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
- %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
- %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
- %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
- tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
- br label %exit
-
-if.false:
- %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
- %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
- %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
- %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
- tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
- br label %exit
-
-exit:
- ret void
-}
-
-declare dso_local void @foo() local_unnamed_addr
-declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
index 57b67c456b36..0dc0c34c340c 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
@@ -36,10 +36,11 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_2
-; CHECK-NEXT: # %bb.1: # %if.true
+; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movw $8, %cx
+; CHECK-NEXT: jne .LBB0_2
+; CHECK-NEXT: # %bb.1: # %if.true
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1
@@ -51,13 +52,11 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2: # %if.false
-; CHECK-NEXT: movl $buf, %eax
-; CHECK-NEXT: movw $8, %cx
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3
@@ -69,7 +68,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
; CHECK-NEXT: tilestored %tmm6, (%r15,%r14)
@@ -140,6 +139,7 @@ define dso_local void @test3(i8 *%buf) nounwind {
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movl $32, %r14d
; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -149,7 +149,7 @@ define dso_local void @test3(i8 *%buf) nounwind {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 1e1154b5f759..b851eea60b0a 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -120,8 +120,6 @@
; CHECK-NEXT: X86 EFLAGS copy lowering
; CHECK-NEXT: X86 WinAlloca Expander
; CHECK-NEXT: MachineDominator Tree Construction
-; CHECK-NEXT: Slot index numbering
-; CHECK-NEXT: Live Interval Analysis
; CHECK-NEXT: Tile Register Pre-configure
; CHECK-NEXT: Detect Dead Lanes
; CHECK-NEXT: Process Implicit Definitions