[llvm] [AMDGPU] Add support for GFX12 expert scheduling mode 2 (PR #170319)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 8 07:14:35 PST 2025
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/170319
From c777f643b08f8fd979750cf6985e11fff1eff9b9 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 27 Nov 2025 16:34:45 +0000
Subject: [PATCH 1/6] [AMDGPU] Add support for GFX12 expert scheduling mode 2
---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 +
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 273 ++++++++++-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 11 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 22 +-
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 17 +-
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 2 +-
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 20 +-
llvm/lib/Target/AMDGPU/VOPCInstructions.td | 27 +-
llvm/lib/Target/AMDGPU/VOPInstructions.td | 12 +
.../AMDGPU/expert_scheduling_gfx12.mir | 454 ++++++++++++++++++
.../AMDGPU/function-esm2-prologue-epilogue.ll | 26 +
.../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 207 ++++++++
12 files changed, 1038 insertions(+), 37 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx12.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/function-esm2-prologue-epilogue.ll
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index e567176e658b3..d5c72e52c3cee 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1478,6 +1478,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
+ /// \returns true if the target supports using software to avoid hazards
+ /// between VMEM and VALU instructions in some instances.
+ bool hasSoftwareHazardMode() const { return getGeneration() >= GFX12; }
+
/// \returns true if the target has s_wait_xcnt insertion. Supported for
/// GFX1250.
bool hasWaitXCnt() const { return HasWaitXcnt; }
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 70db7b4918515..a8613d586ed0f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -63,6 +63,12 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> SoftwareHazardModeFlag(
+ "amdgpu-software-hazard-mode",
+ cl::desc("Enable expert scheduling mode 2 for all kernel functions (GFX12+ "
+ "only)"),
+ cl::init(false), cl::Hidden);
+
namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
@@ -79,7 +85,10 @@ enum InstCounterType {
KM_CNT, // gfx12+ only.
X_CNT, // gfx1250.
NUM_EXTENDED_INST_CNTS,
- NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
+ VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
+ VM_VSRC, // gfx12+ expert mode only.
+ NUM_EXPERT_INST_CNTS,
+ NUM_INST_CNTS = NUM_EXPERT_INST_CNTS
};
} // namespace
@@ -108,6 +117,8 @@ struct HardwareLimits {
unsigned BvhcntMax; // gfx12+ only.
unsigned KmcntMax; // gfx12+ only.
unsigned XcntMax; // gfx1250.
+ unsigned VaVdstMax; // gfx12+ expert mode only.
+ unsigned VmVsrcMax; // gfx12+ expert mode only.
};
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
@@ -129,7 +140,14 @@ struct HardwareLimits {
DECL(EXP_POS_ACCESS) /* write to export position */ \
DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
- DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
+ DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ \
+ DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */ \
+ DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */ \
+ DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */ \
+ DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
+ DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
+ DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
+ DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
// clang-format off
#define AMDGPU_EVENT_ENUM(Name) Name,
@@ -187,7 +205,7 @@ enum VmemType {
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
-// returns true.
+// returns true, and does not cover VA_VDST or VM_VSRC.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
@@ -204,6 +222,10 @@ static bool isNormalMode(InstCounterType MaxCounter) {
}
#endif // NDEBUG
+static bool isExpertMode(InstCounterType MaxCounter) {
+ return MaxCounter == NUM_EXPERT_INST_CNTS;
+}
+
VmemType getVmemType(const MachineInstr &Inst) {
assert(updateVMCntOnly(Inst));
if (!SIInstrInfo::isImage(Inst))
@@ -242,6 +264,10 @@ unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
return Wait.KmCnt;
case X_CNT:
return Wait.XCnt;
+ case VA_VDST:
+ return Wait.VaVdst;
+ case VM_VSRC:
+ return Wait.VmVsrc;
default:
llvm_unreachable("bad InstCounterType");
}
@@ -371,6 +397,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
0,
0,
0,
+ 0,
+ 0,
0};
return WaitEventMaskForInstPreGFX12;
@@ -407,7 +435,10 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
eventMask({VMEM_SAMPLER_READ_ACCESS}),
eventMask({VMEM_BVH_READ_ACCESS}),
eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
- eventMask({VMEM_GROUP, SMEM_GROUP})};
+ eventMask({VMEM_GROUP, SMEM_GROUP}),
+ eventMask({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
+ VGPR_XDL_WRITE}),
+ eventMask({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
return WaitEventMaskForInstGFX12Plus;
}
@@ -482,6 +513,10 @@ class SIInsertWaitcnts {
return Limits.KmcntMax;
case X_CNT:
return Limits.XcntMax;
+ case VA_VDST:
+ return Limits.VaVdstMax;
+ case VM_VSRC:
+ return Limits.VmVsrcMax;
default:
break;
}
@@ -524,6 +559,9 @@ class SIInsertWaitcnts {
ForceEmitWaitcnt[SAMPLE_CNT] = false;
ForceEmitWaitcnt[BVH_CNT] = false;
}
+
+ ForceEmitWaitcnt[VA_VDST] = false;
+ ForceEmitWaitcnt[VM_VSRC] = false;
#endif // NDEBUG
}
@@ -563,6 +601,9 @@ class SIInsertWaitcnts {
return VmemReadMapping[getVmemType(Inst)];
}
+ std::optional<WaitEventType>
+ getSoftwareHazardEventType(const MachineInstr &Inst) const;
+
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@@ -580,6 +621,8 @@ class SIInsertWaitcnts {
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
+ void setSchedulingMode(MachineBasicBlock &MBB, MachineInstr &MI,
+ bool ExpertMode) const;
};
// This objects maintains the current score brackets of each wait counter, and
@@ -639,6 +682,7 @@ class WaitcntBrackets {
bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);
+ void simplifyVmVsrc(AMDGPU::Waitcnt &Wait);
void determineWait(InstCounterType T, RegInterval Interval,
AMDGPU::Waitcnt &Wait) const;
@@ -925,6 +969,8 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
+ assert((T != VA_VDST && T != VM_VSRC) || isExpertMode(Context->MaxCounter));
+
unsigned UB = getScoreUB(T);
unsigned CurrScore = UB + 1;
if (CurrScore == 0)
@@ -1027,6 +1073,20 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
}
for (const MachineOperand &Op : Inst.all_uses())
setScoreByOperand(&Inst, Op, T, CurrScore);
+ } else if (T == VA_VDST || T == VM_VSRC) {
+ // Match the score to the VGPR destination or source registers as
+ // appropriate.
+ for (const MachineOperand &Op : Inst.operands()) {
+ if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
+ (T == VM_VSRC && Op.isDef()))
+ continue;
+ RegInterval Interval = getRegInterval(&Inst, Op);
+ if (Interval.first >= NUM_ALL_VGPRS)
+ continue;
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ setRegScore(RegNo, T, CurrScore);
+ }
+ }
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
// Match the score to the destination registers.
//
@@ -1141,6 +1201,12 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
case X_CNT:
OS << " X_CNT(" << SR << "): ";
break;
+ case VA_VDST:
+ OS << " VA_VDST(" << SR << "): ";
+ break;
+ case VM_VSRC:
+ OS << " VM_VSRC(" << SR << "): ";
+ break;
default:
OS << " UNKNOWN(" << SR << "): ";
break;
@@ -1204,6 +1270,8 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
simplifyXcnt(Wait, Wait);
+ simplifyWaitcnt(VA_VDST, Wait.VaVdst);
+ simplifyVmVsrc(Wait);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1215,6 +1283,17 @@ void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
Count = ~0u;
}
+void WaitcntBrackets::simplifyVmVsrc(AMDGPU::Waitcnt &Wait) {
+ // Waiting for some counters implies waiting for VM_VSRC, since an
+ // instruction that decrements a counter on completion would have
+ // decremented VM_VSRC once its VGPR operands had been read. Make
+ // sure any pending VM_VSRC events are cleared in this case.
+ if (hasPendingEvent(VM_VSRC))
+ applyWaitcnt(VM_VSRC, std::min({Wait.LoadCnt, Wait.StoreCnt, Wait.SampleCnt,
+ Wait.BvhCnt, Wait.DsCnt}));
+ simplifyWaitcnt(VM_VSRC, Wait.VmVsrc);
+}
+
void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
AMDGPU::Waitcnt &Wait) const {
const unsigned LB = getScoreLB(T);
@@ -1274,6 +1353,8 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(BVH_CNT, Wait.BvhCnt);
applyWaitcnt(KM_CNT, Wait.KmCnt);
applyWaitcnt(X_CNT, Wait.XCnt);
+ applyWaitcnt(VA_VDST, Wait.VaVdst);
+ applyWaitcnt(VM_VSRC, Wait.VmVsrc);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1571,8 +1652,9 @@ WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+ unsigned ExpertVal = isExpertMode(MaxCounter) ? 0 : ~0u;
return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
- ~0u /* XCNT */);
+ ~0u /* XCNT */, ExpertVal, ExpertVal);
}
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -1588,6 +1670,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
bool Modified = false;
MachineInstr *CombinedLoadDsCntInstr = nullptr;
MachineInstr *CombinedStoreDsCntInstr = nullptr;
+ MachineInstr *WaitcntDepctrInstr = nullptr;
MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
LLVM_DEBUG({
@@ -1635,6 +1718,16 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
+ } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
+ unsigned OldEnc =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ AMDGPU::Waitcnt OldWait;
+ OldWait.VaVdst = AMDGPU::DepCtr::decodeFieldVaVdst(OldEnc);
+ OldWait.VmVsrc = AMDGPU::DepCtr::decodeFieldVmVsrc(OldEnc);
+ if (TrySimplify)
+ ScoreBrackets.simplifyWaitcnt(OldWait);
+ Wait = Wait.combined(OldWait);
+ UpdatableInstr = &WaitcntDepctrInstr;
} else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
// Architectures higher than GFX10 do not have direct loads to
// LDS, so no work required here yet.
@@ -1654,6 +1747,24 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
// Merge consecutive waitcnt of the same type by erasing multiples.
if (!*UpdatableInstr) {
*UpdatableInstr = &II;
+ } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
+ // S_WAITCNT_DEPCTR requires special care. Don't remove a
+ // duplicate if it is waiting on things other than VA_VDST or
+ // VM_VSRC. If that is the case, just make sure the VA_VDST and
+ // VM_VSRC subfields of the operand are set to the "no wait"
+ // values.
+
+ unsigned Enc = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
+ Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
+
+ if (Enc != 0xffff) {
+ Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
+ Modified |= promoteSoftWaitCnt(&II);
+ } else {
+ II.eraseFromParent();
+ Modified = true;
+ }
} else {
II.eraseFromParent();
Modified = true;
@@ -1670,6 +1781,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
// createNewWaitcnt(). As a side effect, resetting the wait counts will
// cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
// the loop below that deals with single counter instructions.
+ //
+ // A wait for LOAD_CNT or DS_CNT also implies a wait for VM_VSRC: an
+ // instruction only decrements LOAD_CNT or DS_CNT on completion, by which
+ // point it has already read its VGPR sources and decremented VM_VSRC.
if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
@@ -1784,6 +1900,39 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}
}
+ if (WaitcntDepctrInstr) {
+ // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
+ // subfields with the new required values.
+ unsigned Enc =
+ TII->getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
+ ->getImm();
+ Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.VmVsrc);
+ Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
+
+ ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst);
+ ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc);
+ Wait.VaVdst = ~0u;
+ Wait.VmVsrc = ~0u;
+
+ // If that new encoded Depctr immediate would actually still wait
+ // for anything, update the instruction's operand. Otherwise it can
+ // just be deleted.
+ if (Enc != 0xffff) {
+ Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
+ AMDGPU::OpName::simm16, Enc);
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: "
+ << *WaitcntDepctrInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntDepctrInstr << '\n');
+ } else {
+ WaitcntDepctrInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
return Modified;
}
@@ -1848,6 +1997,24 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
dbgs() << "New Instr: " << *SWaitInst << '\n');
}
+ if (Wait.hasWaitDepctr()) {
+ assert(isExpertMode(MaxCounter));
+ unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.VmVsrc, *ST);
+ Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
+
+ if (Enc != 0xffff) {
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(Enc);
+
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
+ }
+
return Modified;
}
@@ -2044,6 +2211,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (Op.isImplicit() && MI.mayLoadOrStore())
continue;
+ ScoreBrackets.determineWait(VA_VDST, Interval, Wait);
+ if (Op.isDef())
+ ScoreBrackets.determineWait(VM_VSRC, Interval, Wait);
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
// previous write and this write are the same type of VMEM
// instruction, in which case they are (in some architectures)
@@ -2105,6 +2275,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);
+ // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
+ // waits on VA_VDST if the instruction it would precede is not a VALU
+ // instruction, since hardware handles VALU->VGPR->VALU hazards in
+ // expert scheduling mode.
+ if (TII->isVALU(MI))
+ Wait.VaVdst = ~0u;
+
// Since the translation for VMEM addresses occur in-order, we can apply the
// XCnt if the current instruction is of VMEM type and has a memory
// dependency with another VMEM instruction in flight.
@@ -2132,6 +2309,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.KmCnt = 0;
if (ForceEmitWaitcnt[X_CNT])
Wait.XCnt = 0;
+ // Only force emit VA_VDST and VM_VSRC if expert mode is enabled.
+ if (isExpertMode(MaxCounter)) {
+ if (ForceEmitWaitcnt[VA_VDST])
+ Wait.VaVdst = 0;
+ if (ForceEmitWaitcnt[VM_VSRC])
+ Wait.VmVsrc = 0;
+ }
if (FlushVmCnt) {
if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
@@ -2187,6 +2371,43 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
return Modified;
}
+std::optional<WaitEventType>
+SIInsertWaitcnts::getSoftwareHazardEventType(const MachineInstr &Inst) const {
+ if (TII->isVALU(Inst)) {
+ // Core/Side-MACC, DP-MACC, XDL and TRANS VALU instructions complete
+ // out-of-order with respect to each other, so each of these classes
+ // has its own event.
+
+ if (TII->isXDL(Inst))
+ return VGPR_XDL_WRITE;
+
+ if (TII->isTRANS(Inst))
+ return VGPR_TRANS_WRITE;
+
+ if (AMDGPU::isDPMACCInstruction(Inst.getOpcode()))
+ return VGPR_DPMACC_WRITE;
+
+ return VGPR_CSMACC_WRITE;
+ }
+
+ // FLAT and LDS instructions may read their VGPR sources out-of-order
+ // with respect to each other and all other VMEM instructions, so
+ // each of these also has a separate event.
+
+ if (TII->isFLAT(Inst))
+ return VGPR_FLAT_READ;
+
+ if (TII->isDS(Inst))
+ return VGPR_LDS_READ;
+
+ if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst))
+ return VGPR_VMEM_READ;
+
+ // Otherwise, no hazard.
+
+ return {};
+}
+
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
@@ -2261,6 +2482,12 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
bool IsVMEMAccess = false;
bool IsSMEMAccess = false;
+
+ if (isExpertMode(MaxCounter)) {
+ if (const auto ET = getSoftwareHazardEventType(Inst))
+ ScoreBrackets->updateByEvent(*ET, Inst);
+ }
+
if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
if (TII->isAlwaysGDS(Inst.getOpcode()) ||
TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
@@ -2452,6 +2679,16 @@ static bool isWaitInstr(MachineInstr &Inst) {
counterTypeForInstr(Opcode).has_value();
}
+void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
+ MachineInstr &MI,
+ bool ExpertMode) const {
+ const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
+ AMDGPU::Hwreg::ID_SCHED_MODE, AMDGPU::Hwreg::HwregOffset::Default, 2);
+ BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ .addImm(ExpertMode ? 2 : 0)
+ .addImm(EncodedReg);
+}
+
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineBasicBlock &Block,
@@ -2492,7 +2729,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Track pre-existing waitcnts that were added in earlier iterations or by
// the memory legalizer.
- if (isWaitInstr(Inst)) {
+ if (isWaitInstr(Inst) || (isExpertMode(MaxCounter) &&
+ Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
if (!OldWaitcntInstr)
OldWaitcntInstr = &Inst;
++Iter;
@@ -2741,7 +2979,14 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
if (ST->hasExtendedWaitCounts()) {
- MaxCounter = NUM_EXTENDED_INST_CNTS;
+ if (ST->hasSoftwareHazardMode() &&
+ (MF.getFunction()
+ .getFnAttribute("amdgpu-software-hazard-mode")
+ .getValueAsBool() ||
+ SoftwareHazardModeFlag))
+ MaxCounter = NUM_EXPERT_INST_CNTS;
+ else
+ MaxCounter = NUM_EXTENDED_INST_CNTS;
WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
WCG = &WCGGFX12Plus;
} else {
@@ -2770,6 +3015,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
+ Limits.VaVdstMax = AMDGPU::DepCtr::getVaVdstBitMask();
+ Limits.VmVsrcMax = AMDGPU::DepCtr::getVmVsrcBitMask();
[[maybe_unused]] unsigned NumVGPRsMax =
ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
@@ -2809,6 +3056,12 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
TII->get(instrsForExtendedCounterTypes[CT]))
.addImm(0);
}
+ if (isExpertMode(MaxCounter)) {
+ unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST);
+ Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0);
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(Enc);
+ }
} else {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
}
@@ -2817,6 +3070,12 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
NonKernelInitialState->setStateOnFunctionEntryOrReturn();
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
+ Modified = true;
+ } else if (isExpertMode(MaxCounter)) {
+ for (MachineBasicBlock::iterator E = EntryBB.end();
+ I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
+ ;
+ setSchedulingMode(EntryBB, *I, true);
Modified = true;
}
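
As an aside on the simplifyVmVsrc logic above: the implication it encodes
can be stated as a one-liner. A minimal sketch (not part of the patch;
impliedVmVsrc is a hypothetical helper name, and the counter semantics are
as elsewhere in this pass: a wait means "counter <= N", ~0u means "no wait",
and smaller values are stronger):

    #include <algorithm> // std::min

    // An instruction decrements LOAD_CNT, STORE_CNT, SAMPLE_CNT, BVH_CNT
    // or DS_CNT only on completion, which happens after its VGPR sources
    // have been read (the point at which VM_VSRC is decremented). So a
    // wait for any of those counters to reach N implies VM_VSRC <= N.
    static unsigned impliedVmVsrc(const AMDGPU::Waitcnt &Wait) {
      return std::min({Wait.LoadCnt, Wait.StoreCnt, Wait.SampleCnt,
                       Wait.BvhCnt, Wait.DsCnt});
    }

simplifyVmVsrc applies this implied wait to the score brackets first, so an
explicit VM_VSRC wait that is no stronger can then be dropped by the generic
simplifyWaitcnt path.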
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 90f0b49ab9a78..5d5ba74cfa5b1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -424,6 +424,8 @@ struct FP4FP8DstByteSelInfo {
bool HasFP4DstByteSel;
};
+#define GET_DPMACCInstructionTable_DECL
+#define GET_DPMACCInstructionTable_IMPL
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
@@ -780,6 +782,11 @@ FPType getFPDstSelType(unsigned Opc) {
return FPType::None;
}
+bool isDPMACCInstruction(unsigned Opc) {
+ const DPMACCInstructionInfo *Info = getDPMACCInstructionHelper(Opc);
+ return Info && Info->IsDPMACCInstruction;
+}
+
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
return Info ? Info->Opcode3Addr : ~0u;
@@ -2020,6 +2027,10 @@ int encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask,
STI);
}
+unsigned getVaVdstBitMask() { return (1 << getVaVdstBitWidth()) - 1; }
+
+unsigned getVmVsrcBitMask() { return (1 << getVmVsrcBitWidth()) - 1; }
+
unsigned decodeFieldVmVsrc(unsigned Encoded) {
return unpackBits(Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth());
}
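
As a sanity check on the new mask helpers, a small round-trip sketch (the
asserts follow from the pack/unpack definitions; 0xffff as the all-default
"no wait" DEPCTR encoding matches its existing use in SIInsertWaitcnts):

    #include <cassert>

    unsigned Enc = 0xffff;                           // all fields: no wait
    Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, 0); // wait VA_VDST == 0
    Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0); // wait VM_VSRC == 0
    assert(AMDGPU::DepCtr::decodeFieldVaVdst(Enc) == 0);
    assert(AMDGPU::DepCtr::decodeFieldVmVsrc(Enc) == 0);
    // The masks are the maximum encodable field values, i.e. what decoding
    // the all-default encoding yields.
    assert(AMDGPU::DepCtr::getVaVdstBitMask() ==
           AMDGPU::DepCtr::decodeFieldVaVdst(0xffffu));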
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 3a352006e006c..03e33409ba27a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1089,6 +1089,8 @@ struct Waitcnt {
unsigned BvhCnt = ~0u; // gfx12+ only.
unsigned KmCnt = ~0u; // gfx12+ only.
unsigned XCnt = ~0u; // gfx1250.
+ unsigned VaVdst = ~0u; // gfx12+ expert scheduling mode only.
+ unsigned VmVsrc = ~0u; // gfx12+ expert scheduling mode only.
Waitcnt() = default;
// Pre-gfx12 constructor.
@@ -1097,19 +1099,24 @@ struct Waitcnt {
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
- unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt)
+ unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt,
+ unsigned VaVdst, unsigned VmVsrc)
: LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
- SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt) {}
+ SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt),
+ VaVdst(VaVdst), VmVsrc(VmVsrc) {}
bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
bool hasWaitExceptStoreCnt() const {
return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u ||
- SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u;
+ SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u ||
+ VaVdst != ~0u || VmVsrc != ~0u;
}
bool hasWaitStoreCnt() const { return StoreCnt != ~0u; }
+ bool hasWaitDepctr() const { return VaVdst != ~0u || VmVsrc != ~0u; }
+
Waitcnt combined(const Waitcnt &Other) const {
// Does the right thing provided self and Other are either both pre-gfx12
// or both gfx12+.
@@ -1117,7 +1124,8 @@ struct Waitcnt {
std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt),
std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt),
std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt),
- std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt));
+ std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt),
+ std::min(VaVdst, Other.VaVdst), std::min(VmVsrc, Other.VmVsrc));
}
};
@@ -1279,6 +1287,12 @@ bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal,
bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val,
bool &IsDefault, const MCSubtargetInfo &STI);
+/// \returns Maximum VaVdst value that can be encoded.
+unsigned getVaVdstBitMask();
+
+/// \returns Maximum VmVsrc value that can be encoded.
+unsigned getVmVsrcBitMask();
+
/// \returns Decoded VaVdst from given immediate \p Encoded.
unsigned decodeFieldVaVdst(unsigned Encoded);
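
A short illustration of the widened gfx12+ constructor and combined()
(values chosen for the example; ~0u means "no wait" and std::min keeps the
stronger requirement per counter):

    // Operand order: LoadCnt, ExpCnt, DsCnt, StoreCnt, SampleCnt, BvhCnt,
    // KmCnt, XCnt, VaVdst, VmVsrc.
    AMDGPU::Waitcnt A(0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 1, ~0u);
    AMDGPU::Waitcnt B(2, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0);
    AMDGPU::Waitcnt C = A.combined(B);
    // C.LoadCnt == 0, C.VaVdst == 1, C.VmVsrc == 0, and C.hasWaitDepctr()
    // is true, so the inserter would emit an S_WAITCNT_DEPCTR for C.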
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 1d1e95908fce6..4788406f112ba 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -263,16 +263,19 @@ let HasOMod = 0, HasClamp = 0 in {
let isReMaterializable = 1 in {
let SchedRW = [WriteDoubleCvt] in {
// OMod clears exceptions when set in this instruction
+let IsDPMACCInstruction = 1 in
defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_sint>;
let mayRaiseFPException = 0 in {
defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
}
+let IsDPMACCInstruction = 1 in {
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
-defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, any_fpextend>;
// OMod clears exceptions when set in this instruction
defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_uint>;
+} // IsDPMACCInstruction = 1
+defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, any_fpextend>;
let mayRaiseFPException = 0 in {
defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
@@ -349,11 +352,11 @@ defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, int_amdgcn_sqrt>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
-let TRANS = 1, SchedRW = [WriteTrans64] in {
+let TRANS = 1, SchedRW = [WriteTrans64], IsDPMACCInstruction = 1 in {
defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64_NO_DPP, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64_NO_DPP, AMDGPUrsq>;
defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64_NO_DPP, int_amdgcn_sqrt>;
-} // End TRANS = 1, SchedRW = [WriteTrans64]
+} // End TRANS = 1, SchedRW = [WriteTrans64], IsDPMACCInstruction = 1
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
@@ -369,13 +372,13 @@ defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>;
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in {
defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64_SPECIAL_OMOD, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
let FPDPRounding = 1 in {
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
} // End FPDPRounding = 1
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1
defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;
@@ -493,12 +496,12 @@ let SubtargetPredicate = isGFX7GFX8GFX9 in {
} // End SubtargetPredicate = isGFX7GFX8GFX9
let SubtargetPredicate = isGFX7Plus in {
- let SchedRW = [WriteDoubleAdd] in {
+ let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in {
defm V_TRUNC_F64 : VOP1Inst<"v_trunc_f64", VOP_F64_F64, ftrunc>;
defm V_CEIL_F64 : VOP1Inst<"v_ceil_f64", VOP_F64_F64, fceil>;
defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, froundeven>;
defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>;
- } // End SchedRW = [WriteDoubleAdd]
+ } // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1
} // End SubtargetPredicate = isGFX7Plus
} // End isReMaterializable = 1
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index dbb7862ab4ab5..d94602f3b8385 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1481,7 +1481,7 @@ let SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 in {
} // End SubtargetPredicate = isGFX12Plus, isReMaterializable = 1
let SubtargetPredicate = HasIEEEMinimumMaximumInsts, isReMaterializable = 1,
- SchedRW = [WriteDoubleAdd], isCommutable = 1 in {
+ SchedRW = [WriteDoubleAdd], isCommutable = 1, IsDPMACCInstruction = 1 in {
defm V_MIN_NUM_F64 : VOP2Inst_VOPD <"v_min_num_f64", VOP_F64_F64_F64, 0x24, "v_min_num_f64", fminnum_like>;
defm V_MAX_NUM_F64 : VOP2Inst_VOPD <"v_max_num_f64", VOP_F64_F64_F64, 0x23, "v_max_num_f64", fmaxnum_like>;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index faab9f3062829..e405d9405be82 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -199,9 +199,11 @@ let SchedRW = [WriteIntMul] in {
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
+let IsDPMACCInstruction = 1 in
defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">;
let SubtargetPredicate = isNotGFX12Plus in {
defm V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd>;
+let IsDPMACCInstruction = 1 in
defm V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fmul>;
} // End SubtargetPredicate = isNotGFX12Plus
} // End FPDPRounding = 1
@@ -224,10 +226,10 @@ defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, f
defm V_MINIMUM_F16 : VOP3Inst_t16 <"v_minimum_f16", VOP_F16_F16_F16, fminimum>;
defm V_MAXIMUM_F16 : VOP3Inst_t16 <"v_maximum_f16", VOP_F16_F16_F16, fmaximum>;
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in {
defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>;
defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaximum>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1
} // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1
let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in {
@@ -252,7 +254,7 @@ defm V_DIV_FMAS_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_fmas_f32", VOP_F32_F32_F32
// if (vcc)
// result *= 2^64
//
-let SchedRW = [WriteDouble], FPDPRounding = 1 in
+let SchedRW = [WriteDouble], FPDPRounding = 1, IsDPMACCInstruction = 1 in
defm V_DIV_FMAS_F64 : VOP3Inst <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC>;
} // End Uses = [MODE, VCC, EXEC]
@@ -317,10 +319,10 @@ defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_
defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>;
-let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1, IsDPMACCInstruction = 1 in {
defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP_F64_F64_F64_F64_DPP_PROF, AMDGPUdiv_fixup>;
defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, any_fldexp>;
-} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1, IsDPMACCInstruction = 1
} // End isReMaterializable = 1
let SubtargetPredicate = isGFX9GFX10 in
@@ -358,7 +360,7 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d
defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32> ;
// Double precision division pre-scale.
- let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1 in
+ let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1, IsDPMACCInstruction = 1 in
defm V_DIV_SCALE_F64 : VOP3Inst <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64>;
} // End mayRaiseFPException = 0
@@ -371,12 +373,12 @@ defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64
let isReMaterializable = 1 in {
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDouble], IsDPMACCInstruction = 1 in {
defm V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop>;
-} // End SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble], IsDPMACCInstruction = 1
let SchedRW = [Write64Bit] in {
- let SubtargetPredicate = isGFX6GFX7 in {
+ let SubtargetPredicate = isGFX6GFX7, IsDPMACCInstruction = 1 in {
defm V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, cshl_64>;
defm V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, csrl_64>;
defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index a829b807f33e8..bfa377066a602 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -518,8 +518,10 @@ multiclass VOPC_F16 <string opName, SDPatternOperator cond = COND_NULL,
multiclass VOPC_F32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>;
-multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>;
+let IsDPMACCInstruction = 1 in {
+ multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>;
+}
multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL,
string revOp = opName> {
@@ -537,9 +539,10 @@ multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL,
multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
-multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
-
+let IsDPMACCInstruction = 1 in {
+ multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+}
multiclass VOPCX_F16<string opName, string revOp = opName> {
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
@@ -556,8 +559,10 @@ multiclass VOPCX_F16<string opName, string revOp = opName> {
multiclass VOPCX_F32 <string opName, string revOp = opName> :
VOPCX_Pseudos <opName, VOPC_I1_F32_F32, VOPC_F32_F32, COND_NULL, revOp>;
-multiclass VOPCX_F64 <string opName, string revOp = opName> :
- VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>;
+let IsDPMACCInstruction = 1 in {
+ multiclass VOPCX_F64 <string opName, string revOp = opName> :
+ VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>;
+}
multiclass VOPCX_I16<string opName, string revOp = opName> {
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
@@ -574,8 +579,10 @@ multiclass VOPCX_I16<string opName, string revOp = opName> {
multiclass VOPCX_I32 <string opName, string revOp = opName> :
VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>;
-multiclass VOPCX_I64 <string opName, string revOp = opName> :
- VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
+let IsDPMACCInstruction = 1 in {
+ multiclass VOPCX_I64 <string opName, string revOp = opName> :
+ VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
+}
//===----------------------------------------------------------------------===//
@@ -1210,11 +1217,13 @@ multiclass VOPC_CLASS_F32 <string opName> {
multiclass VOPCX_CLASS_F32 <string opName> :
VOPCX_Class_Pseudos <opName, VOPC_I1_F32_I32, VOPC_F32_I32>;
+// FIXME: let IsDPMACCInstruction = 1 in
multiclass VOPC_CLASS_F64 <string opName> {
defm NAME : VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>;
defm : VOPCClassPat64<NAME>;
}
+// FIXME: let IsDPMACCInstruction = 1 in
multiclass VOPCX_CLASS_F64 <string opName> :
VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index ea3edb8ca6662..9664535b11539 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -18,6 +18,7 @@ class LetDummies {
bit isConvergent;
bit isAsCheapAsAMove;
bit FPDPRounding;
+ bit IsDPMACCInstruction;
Predicate SubtargetPredicate;
string Constraints;
string DisableEncoding;
@@ -71,6 +72,7 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
string Mnemonic = opName;
Instruction Opcode = !cast<Instruction>(NAME);
bit IsTrue16 = P.IsTrue16;
+ bit IsDPMACCInstruction = 0;
VOPProfile Pfl = P;
string AsmOperands;
@@ -166,6 +168,7 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
class VOP_Real<VOP_Pseudo ps> {
Instruction Opcode = !cast<Instruction>(NAME);
bit IsSingle = ps.Pfl.IsSingle;
+ bit IsDPMACCInstruction = ps.IsDPMACCInstruction;
}
class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
@@ -2278,3 +2281,12 @@ def VOPTrue16Table : GenericTable {
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getTrue16OpcodeHelper";
}
+
+def DPMACCInstructionTable : GenericTable {
+ let FilterClass = "VOP_Pseudo";
+ let CppTypeName = "DPMACCInstructionInfo";
+ let Fields = ["Opcode", "IsDPMACCInstruction"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getDPMACCInstructionHelper";
+}
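
For reference, the generated table is keyed by opcode and consumed through
the isDPMACCInstruction helper added in AMDGPUBaseInfo.cpp. A sketch of both
sides (the exact field types emitted by TableGen are an assumption here):

    struct DPMACCInstructionInfo { // per the Fields list above
      uint16_t Opcode;
      bool IsDPMACCInstruction;
    };

    // Consumer side, as used by getSoftwareHazardEventType: route a DPMACC
    // VALU write to its own out-of-order completion event.
    if (AMDGPU::isDPMACCInstruction(Inst.getOpcode()))
      return VGPR_DPMACC_WRITE;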
diff --git a/llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx12.mir b/llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx12.mir
new file mode 100644
index 0000000000000..e615a83eb4da5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx12.mir
@@ -0,0 +1,454 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-software-hazard-mode -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-waitcnt-forcezero -run-pass si-insert-waitcnts %s -o - | FileCheck %s
+
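+# A note on the S_WAITCNT_DEPCTR immediates checked below, assuming the
+# GFX12 DEPCTR field layout (VA_VDST in bits [15:12], VM_VSRC in bits
+# [4:2], all other fields left at their "no wait" defaults):
+#   3971  = 0x0f83 -> wait until VA_VDST == 0 and VM_VSRC == 0
+#   3999  = 0x0f9f -> wait until VA_VDST == 0
+#   8095  = 0x1f9f -> wait until VA_VDST <= 1
+#   8167  = 0x1fe7 -> wait until VA_VDST <= 1 and VM_VSRC <= 1
+#   65411 = 0xff83 -> wait until VM_VSRC == 0
+#   65415 = 0xff87 -> wait until VM_VSRC <= 1
+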
+---
+# Make sure we don't output expert-mode waitcnts when the
+# amdgpu-waitcnt-forcezero flag is used but hazard mode isn't enabled;
+# otherwise it would cause an assert.
+# CHECK-LABEL: name: raw_exp
+
+# GCN-LABEL: name: raw_exp
+# GCN: bb.0
+# GCN: S_WAITCNT_DEPCTR 3999
+# GCN-NEXT: EXP_DONE
+
+name: raw_exp
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 1082130432, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 1073741824, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 1065353216, implicit $exec
+ $vgpr3 = V_MOV_B32_e32 1056964608, implicit $exec
+ EXP_DONE 15, $vgpr3, $vgpr2, $vgpr1, $vgpr0, 0, 0, 1, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+# GCN-LABEL: name: raw_vmem
+# GCN: bb.0
+# GCN: $vgpr0 = nofpexcept V_ADD_F32_e32
+# GCN-NEXT: S_WAITCNT_DEPCTR 3999
+# GCN-NEXT: IMAGE_STORE_V4_V1_gfx12
+
+name: raw_vmem
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0
+
+ $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_LOAD_V4_V1_gfx12 $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ $vgpr3 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr3, implicit $mode, implicit $exec
+ $vgpr2 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr1, implicit $mode, implicit $exec
+ $vgpr0 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr0, implicit $mode, implicit $exec
+ IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+...
+
+---
+# GCN-LABEL: name: war_lds
+# GCN: bb.0
+# GCN: IMAGE_LOAD_V4_V1
+# GCN-NEXT: S_WAITCNT_DEPCTR 65411
+# GCN-NEXT: DS_READ_B128
+
+name: war_lds
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1
+
+ $vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_LOAD_V4_V1_gfx12 $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ $vgpr0_vgpr1_vgpr2_vgpr3 = DS_READ_B128_gfx9 $vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr0 = nofpexcept V_ADD_F32_e32 $vgpr5, $vgpr0, implicit $mode, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 $vgpr6, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = nofpexcept V_ADD_F32_e32 $vgpr7, $vgpr2, implicit $mode, implicit $exec
+ $vgpr3 = nofpexcept V_ADD_F32_e32 $vgpr8, $vgpr3, implicit $mode, implicit $exec
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+...
+
+---
+# GCN-LABEL: name: war_tex
+# GCN: bb.0
+# GCN: DS_WRITE_B128
+# GCN-NEXT: S_WAITCNT_DEPCTR 65411
+# GCN-NEXT: IMAGE_LOAD_V4_V1
+
+name: war_tex
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2
+
+ $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 $vgpr2, implicit $exec
+ $vgpr0_vgpr1_vgpr2_vgpr3 = DS_READ_B128_gfx9 $vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ DS_WRITE_B128_gfx9 $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (store (s128), addrspace 3)
+ $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_LOAD_V4_V1_gfx12 $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+...
+
+---
+# GCN-LABEL: name: war_valu
+# GCN: bb.0
+# GCN: S_WAITCNT_DEPCTR 65411
+# GCN-NEXT: V_MOV_B32_e32
+
+name: war_valu
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0
+
+ $vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_LOAD_V4_V1_gfx12 $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ $vgpr4 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr5, implicit $mode, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr6, implicit $mode, implicit $exec
+ $vgpr2 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr7, implicit $mode, implicit $exec
+ $vgpr3 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr8, implicit $mode, implicit $exec
+ IMAGE_STORE_V4_V1_gfx12 $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+ $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+...
+
+---
+# GCN-LABEL: name: war_vmem
+# GCN: bb.0
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: S_WAITCNT_DEPCTR 3999
+# GCN-NEXT: IMAGE_LOAD_V4_V1_gfx12
+# GCN: $vgpr0 = nofpexcept V_ADD_F32_e32
+# GCN-NEXT: S_WAITCNT_DEPCTR 3999
+# GCN-NEXT: IMAGE_STORE_V4_V1_gfx12
+
+name: war_vmem
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0
+
+ $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_LOAD_V4_V1_gfx12 $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ $vgpr3 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr3, implicit $mode, implicit $exec
+ $vgpr2 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr1, implicit $mode, implicit $exec
+ $vgpr0 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr0, implicit $mode, implicit $exec
+ IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+...
+
+---
+# GCN-LABEL: name: waw_vmem
+# GCN: bb.0
+# GCN: V_MOV_B32_e32
+# GCN-NEXT: S_WAITCNT_DEPCTR 3999
+# GCN-NEXT: IMAGE_LOAD_V4_V1_gfx12
+
+name: waw_vmem
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0
+
+ $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_LOAD_V4_V1_gfx12 $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ $vgpr3 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr7, implicit $mode, implicit $exec
+ $vgpr2 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr6, implicit $mode, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr5, implicit $mode, implicit $exec
+ $vgpr0 = nofpexcept V_ADD_F32_e32 1065353216, $vgpr4, implicit $mode, implicit $exec
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+...
+
+---
+# GCN-LABEL: name: raw_valu_scratch
+# GCN: S_WAIT_LOADCNT_DSCNT 0
+# GCN-NEXT: S_WAIT_EXPCNT 0
+# GCN-NEXT: S_WAIT_SAMPLECNT 0
+# GCN-NEXT: S_WAIT_BVHCNT 0
+# GCN-NEXT: S_WAIT_KMCNT 0
+# GCN-NEXT: S_WAITCNT_DEPCTR 3971
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec
+# GCN-NEXT: S_WAITCNT_DEPCTR 8095
+# GCN-NEXT: $vgpr4 = SCRATCH_LOAD_UBYTE_SVS $vgpr2, $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
+
+name: raw_valu_scratch
+body: |
+ bb.0:
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec
+ $vgpr4 = SCRATCH_LOAD_UBYTE_SVS $vgpr2, $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+# This pre-existing S_WAITCNT_DEPCTR should not be updated.
+
+# GCN-LABEL: name: valu_depctr_valu
+# GCN: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN: S_WAITCNT_DEPCTR 8167
+# GCN: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+name: valu_depctr_valu
+body: |
+ bb.0:
+ $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ S_WAITCNT_DEPCTR 8167
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: test_xdl_csmacc_ooo_completion
+# GCN: $vgpr8 = V_ADD_F32_e32 244, $vgpr11, implicit $exec, implicit $mode
+# GCN-NEXT: S_WAITCNT_DEPCTR 3999
+# GCN-NEXT: IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+
+name: test_xdl_csmacc_ooo_completion
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18
+
+ $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ $vgpr8 = V_ADD_F32_e32 244, $vgpr11, implicit $exec, implicit $mode
+ IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+...
+
+---
+# GCN-LABEL: name: test_dpmacc_csmacc_ooo_completion
+# GCN: $vgpr8 = V_ADD_F32_e32 244, $vgpr11, implicit $exec, implicit $mode
+# GCN-NEXT: S_WAITCNT_DEPCTR 3999
+# GCN-NEXT: IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+
+name: test_dpmacc_csmacc_ooo_completion
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr11, $vgpr18
+
+ $vgpr0_vgpr1 = V_FRACT_F64_e32 $vgpr0_vgpr1, implicit $exec, implicit $mode
+ $vgpr8 = V_ADD_F32_e32 244, $vgpr11, implicit $exec, implicit $mode
+ IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+...
+
+---
+# GCN-LABEL: name: test_trans_csmacc_ooo_completion
+# GCN: $vgpr8 = V_ADD_F32_e32 244, $vgpr11, implicit $exec, implicit $mode
+# GCN-NEXT: S_WAITCNT_DEPCTR 3999
+# GCN-NEXT: IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+
+name: test_trans_csmacc_ooo_completion
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr11, $vgpr18
+
+ $vgpr0 = V_SQRT_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr8 = V_ADD_F32_e32 244, $vgpr11, implicit $exec, implicit $mode
+ IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+...
+
+---
+# GCN-LABEL: name: test_csmacc_csmacc_io_completion
+# GCN: $vgpr8 = V_ADD_F32_e32 244, $vgpr11, implicit $exec, implicit $mode
+# GCN-NEXT: S_WAITCNT_DEPCTR 8095
+# GCN-NEXT: IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+
+name: test_csmacc_csmacc_io_completion
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr11, $vgpr18
+
+ $vgpr0 = V_ADD_F32_e32 244, $vgpr0, implicit $exec, implicit $mode
+ $vgpr8 = V_ADD_F32_e32 244, $vgpr11, implicit $exec, implicit $mode
+ IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+...
+
+---
+# GCN-LABEL: name: test_dpmacc_dpmacc_io_completion
+# GCN: $vgpr4_vgpr5 = V_TRUNC_F64_e32 $vgpr2_vgpr3, implicit $exec, implicit $mode
+# GCN-NEXT: S_WAITCNT_DEPCTR 8095
+# GCN-NEXT: IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+
+name: test_dpmacc_dpmacc_io_completion
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr18
+
+ $vgpr0_vgpr1 = V_FRACT_F64_e32 $vgpr0_vgpr1, implicit $exec, implicit $mode
+ $vgpr4_vgpr5 = V_TRUNC_F64_e32 $vgpr2_vgpr3, implicit $exec, implicit $mode
+ IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+...
+
+---
+# GCN-LABEL: name: test_trans_trans_io_completion
+# GCN: $vgpr8 = V_LOG_F32_e32 $vgpr11, implicit $exec, implicit $mode
+# GCN-NEXT: S_WAITCNT_DEPCTR 8095
+# GCN-NEXT: IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+
+name: test_trans_trans_io_completion
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr11, $vgpr18
+
+ $vgpr0 = V_SQRT_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr8 = V_LOG_F32_e32 $vgpr11, implicit $exec, implicit $mode
+ IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+...
+
+---
+# GCN-LABEL: name: test_xdl_xdl_io_completion
+# GCN: $vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26, 0, 0, implicit $exec
+# GCN-NEXT: S_WAITCNT_DEPCTR 8095
+# GCN-NEXT: IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+
+name: test_xdl_xdl_io_completion
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
+
+ $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13, 8, $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ $vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26, 0, 0, implicit $exec
+ IMAGE_STORE_V4_V1_gfx12 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+...
+
+---
+# GCN-LABEL: name: test_war_vmvsrc_vmem_vmem_io
+# GCN: bb.0
+# GCN: $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_LOAD_V4_V1
+# GCN-NEXT: S_WAITCNT_DEPCTR 65415
+# GCN-NEXT: V_ADD_F32
+# GCN-NEXT: S_WAITCNT_DEPCTR 65411
+# GCN-NEXT: V_ADD_F32
+
+name: test_war_vmvsrc_vmem_vmem_io
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1
+
+ $vgpr8_vgpr9_vgpr10_vgpr11 = IMAGE_LOAD_V4_V1_gfx12 $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_LOAD_V4_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ $vgpr0 = nofpexcept V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
+
+...
+
+---
+# GCN-LABEL: name: test_war_vmvsrc_flat_flat_io
+# GCN: bb.0
+# GCN: $vgpr5 = FLAT_LOAD_DWORD
+# GCN-NEXT: S_WAITCNT_DEPCTR 65415
+# GCN-NEXT: V_ADD_F32
+# GCN-NEXT: S_WAITCNT_DEPCTR 65411
+# GCN-NEXT: V_ADD_F32
+
+name: test_war_vmvsrc_flat_flat_io
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr5 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr0 = nofpexcept V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = nofpexcept V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr2
+
+...
+
+---
+# GCN-LABEL: name: test_war_vmvsrc_lds_lds_io
+# GCN: bb.0
+# GCN: $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128
+# GCN-NEXT: S_WAITCNT_DEPCTR 65415
+# GCN-NEXT: V_ADD_F32
+# GCN-NEXT: S_WAITCNT_DEPCTR 65411
+# GCN-NEXT: V_ADD_F32
+
+name: test_war_vmvsrc_lds_lds_io
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ $vgpr8_vgpr9_vgpr10_vgpr11 = DS_READ_B128_gfx9 $vgpr0, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 $vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr0 = nofpexcept V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
+
+...
+
+---
+# GCN-LABEL: name: test_war_vmvsrc_vmem_lds_ooo
+# GCN: bb.0
+# GCN: DS_READ_B128
+# GCN-NEXT: S_WAITCNT_DEPCTR 65411
+# GCN-NEXT: V_ADD_F32
+
+name: test_war_vmvsrc_vmem_lds_ooo
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1
+
+ $vgpr8_vgpr9_vgpr10_vgpr11 = IMAGE_LOAD_V4_V1_gfx12 $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 $vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr0 = nofpexcept V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
+
+...
+
+---
+# GCN-LABEL: name: test_war_vmvsrc_flat_lds_ooo
+# GCN: bb.0
+# GCN: DS_READ_B128
+# GCN-NEXT: S_WAITCNT_DEPCTR 65411
+# GCN-NEXT: V_ADD_F32
+
+name: test_war_vmvsrc_flat_lds_ooo
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 $vgpr2, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+ $vgpr1 = nofpexcept V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = nofpexcept V_ADD_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ SI_RETURN_TO_EPILOG $vgpr1, $vgpr2
+
+...
+
+---
+# GCN-LABEL: name: test_war_vmvsrc_flat_vmem_ooo
+# GCN: bb.0
+# GCN: IMAGE_LOAD_V4_V1
+# GCN-NEXT: S_WAITCNT_DEPCTR 65411
+# GCN-NEXT: V_ADD_F32
+
+name: test_war_vmvsrc_flat_vmem_ooo
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2
+
+ $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_LOAD_V4_V1_gfx12 $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ $vgpr1 = nofpexcept V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = nofpexcept V_ADD_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ SI_RETURN_TO_EPILOG $vgpr1, $vgpr2
+
+...
+
+---
+# Ensure there is no unnecessary wait on vm_vsrc when the load has already
+# completed.
+
+# GCN-LABEL: name: test_war_completed_vmem
+# GCN: GLOBAL_LOAD_DWORD
+# GCN-NEXT: S_WAIT_LOADCNT 0
+# GCN-NEXT: V_MOV_B32_e32
+# GCN-NEXT: V_MOV_B32_e32
+name: test_war_completed_vmem
+body: |
+ bb.0:
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 100, 0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+...
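A note on the raw S_WAITCNT_DEPCTR immediates in the checks above (8095, 65415, 65411): they are packed depctr fields. Below is a minimal decode sketch, assuming the GFX10+ depctr layout with VA_VDST in bits [15:12] (0xF meaning "no wait") and VM_VSRC in bits [4:2] (0x7 meaning "no wait"); it is an illustration, not the in-tree AMDGPU::DepCtr helpers.

#include <cstdio>
#include <initializer_list>

// Illustrative field extraction only, under the layout assumed above.
static unsigned vaVdst(unsigned Imm) { return (Imm >> 12) & 0xF; }
static unsigned vmVsrc(unsigned Imm) { return (Imm >> 2) & 0x7; }

int main() {
  // 8095  = 0x1f9f -> va_vdst(1): the IMAGE_STORE only needs the first
  //                   WMMA's result, so one VALU may stay outstanding.
  // 65415 = 0xff87 -> vm_vsrc(1): one VMEM instruction may still have a
  //                   pending read of its source VGPRs.
  // 65411 = 0xff83 -> vm_vsrc(0): all VMEM source-VGPR reads drained.
  for (unsigned Imm : {8095u, 65415u, 65411u})
    std::printf("0x%04x: va_vdst(%u) vm_vsrc(%u)\n", Imm, vaVdst(Imm),
                vmVsrc(Imm));
}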
diff --git a/llvm/test/CodeGen/AMDGPU/function-esm2-prologue-epilogue.ll b/llvm/test/CodeGen/AMDGPU/function-esm2-prologue-epilogue.ll
new file mode 100644
index 0000000000000..efe40c58f1609
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-esm2-prologue-epilogue.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1200 -amdgpu-software-hazard-mode | FileCheck %s --check-prefix=GFX12
+
+declare i16 @llvm.bswap.i16(i16) nounwind readnone
+
+define float @missing_truncate_promote_bswap(i32 %arg) {
+; GFX12-LABEL: missing_truncate_promote_bswap:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu depctr_va_vdst(0) depctr_vm_vsrc(0)
+; GFX12-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX12-NEXT: s_wait_alu depctr_va_vdst(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %tmp = trunc i32 %arg to i16
+ %tmp1 = call i16 @llvm.bswap.i16(i16 %tmp)
+ %tmp2 = bitcast i16 %tmp1 to half
+ %tmp3 = fpext half %tmp2 to float
+ ret float %tmp3
+}
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index bd4d2d50738d5..36cf229d4cf4f 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch -amdgpu-software-hazard-mode < %s | FileCheck --check-prefix=GFX12ES2-SPREFETCH %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX1250 %s
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
@@ -64,6 +65,40 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
; GFX12-SPREFETCH-NEXT: .LBB0_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
;
+; GFX12ES2-SPREFETCH-LABEL: copy_flat:
+; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
+; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
+; GFX12ES2-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3
+; GFX12ES2-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12ES2-SPREFETCH-NEXT: s_movk_i32 s4, 0xff50
+; GFX12ES2-SPREFETCH-NEXT: s_mov_b32 s5, -1
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GFX12ES2-SPREFETCH-NEXT: .LBB0_2: ; %for.body
+; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5]
+; GFX12ES2-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12ES2-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1
+; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(1)
+; GFX12ES2-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1]
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0
+; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX12ES2-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0)
+; GFX12ES2-SPREFETCH-NEXT: flat_store_b128 v[4:5], v[0:3]
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_2
+; GFX12ES2-SPREFETCH-NEXT: .LBB0_3: ; %for.end
+; GFX12ES2-SPREFETCH-NEXT: s_endpgm
+;
; GFX1250-LABEL: copy_flat:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -160,6 +195,33 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp
; GFX12-SPREFETCH-NEXT: .LBB1_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
;
+; GFX12ES2-SPREFETCH-LABEL: copy_global:
+; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
+; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
+; GFX12ES2-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_3
+; GFX12ES2-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12ES2-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GFX12ES2-SPREFETCH-NEXT: .LBB1_2: ; %for.body
+; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0) depctr_vm_vsrc(0)
+; GFX12ES2-SPREFETCH-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176
+; GFX12ES2-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
+; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1
+; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_loadcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: global_store_b128 v0, v[1:4], s[0:1]
+; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_2
+; GFX12ES2-SPREFETCH-NEXT: .LBB1_3: ; %for.end
+; GFX12ES2-SPREFETCH-NEXT: s_endpgm
+;
; GFX1250-LABEL: copy_global:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -258,6 +320,35 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
; GFX12-SPREFETCH-NEXT: .LBB2_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
;
+; GFX12ES2-SPREFETCH-LABEL: copy_constant:
+; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
+; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
+; GFX12ES2-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_3
+; GFX12ES2-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12ES2-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
+; GFX12ES2-SPREFETCH-NEXT: .LBB2_2: ; %for.body
+; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0
+; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1
+; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
+; GFX12ES2-SPREFETCH-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0)
+; GFX12ES2-SPREFETCH-NEXT: global_store_b128 v0, v[1:4], s[0:1]
+; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_2
+; GFX12ES2-SPREFETCH-NEXT: .LBB2_3: ; %for.end
+; GFX12ES2-SPREFETCH-NEXT: s_endpgm
+;
; GFX1250-LABEL: copy_constant:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -356,6 +447,35 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa
; GFX12-SPREFETCH-NEXT: .LBB3_2: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
;
+; GFX12ES2-SPREFETCH-LABEL: copy_local:
+; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
+; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
+; GFX12ES2-SPREFETCH-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s2, 0
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_2
+; GFX12ES2-SPREFETCH-NEXT: .LBB3_1: ; %for.body
+; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_sa_sdst(0) depctr_vm_vsrc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_mov_b32_e32 v2, s1
+; GFX12ES2-SPREFETCH-NEXT: v_mov_b32_e32 v4, s0
+; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s2, s2, -1
+; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s0, s0, 16
+; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s1, s1, 16
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(1)
+; GFX12ES2-SPREFETCH-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
+; GFX12ES2-SPREFETCH-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s2, 0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_dscnt 0x1
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0)
+; GFX12ES2-SPREFETCH-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
+; GFX12ES2-SPREFETCH-NEXT: s_wait_dscnt 0x1
+; GFX12ES2-SPREFETCH-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX12ES2-SPREFETCH-NEXT: .LBB3_2: ; %for.end
+; GFX12ES2-SPREFETCH-NEXT: s_endpgm
+;
; GFX1250-LABEL: copy_local:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -483,6 +603,51 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r
; GFX12-SPREFETCH-NEXT: .LBB4_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
;
+; GFX12ES2-SPREFETCH-LABEL: copy_flat_divergent:
+; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
+; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
+; GFX12ES2-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_3
+; GFX12ES2-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12ES2-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12ES2-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
+; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12ES2-SPREFETCH-NEXT: .LBB4_2: ; %for.body
+; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffff50, v2
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(2)
+; GFX12ES2-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[4:5]
+; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: flat_store_b128 v[0:1], v[4:7]
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX12ES2-SPREFETCH-NEXT: .LBB4_3: ; %for.end
+; GFX12ES2-SPREFETCH-NEXT: s_endpgm
+;
; GFX1250-LABEL: copy_flat_divergent:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -617,6 +782,48 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d,
; GFX12-SPREFETCH-NEXT: .LBB5_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
;
+; GFX12ES2-SPREFETCH-LABEL: copy_global_divergent:
+; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
+; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
+; GFX12ES2-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX12ES2-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12ES2-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12ES2-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
+; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12ES2-SPREFETCH-NEXT: .LBB5_2: ; %for.body
+; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0)
+; GFX12ES2-SPREFETCH-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12ES2-SPREFETCH-NEXT: s_wait_loadcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT: global_store_b128 v[0:1], v[4:7], off
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_2
+; GFX12ES2-SPREFETCH-NEXT: .LBB5_3: ; %for.end
+; GFX12ES2-SPREFETCH-NEXT: s_endpgm
+;
; GFX1250-LABEL: copy_global_divergent:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
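Each GFX12ES2-SPREFETCH prologue above enables the mode with s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2, writing the value 2 into the low two bits of the scheduling-mode register. A minimal sketch of the hwreg() operand arithmetic follows, assuming the usual s_setreg simm16 packing (id in bits [5:0], offset in [10:6], width minus one in [15:11]); the register id used below is a placeholder, not the real HW_REG_WAVE_SCHED_MODE id.

#include <cassert>
#include <cstdio>

// hwreg(id, offset, width) packs into the s_setreg simm16 operand.
static unsigned encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
  assert(Id < 64 && Offset < 32 && Width >= 1 && Width <= 32);
  return Id | (Offset << 6) | ((Width - 1) << 11);
}

int main() {
  const unsigned WaveSchedModeId = 0x1f; // placeholder id for illustration
  // hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2) selects bits [1:0]; the prologue
  // then writes 2 there to request expert scheduling mode 2.
  std::printf("simm16 = 0x%04x, value = 2\n",
              encodeHwreg(WaveSchedModeId, 0, 2));
}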
>From ad337b0eab1f3b363b66db80726f0b4d09950a61 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 4 Dec 2025 12:42:26 +0000
Subject: [PATCH 2/6] Rename software hazard to expert scheduling
---
---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7 ++++---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 16 ++++++++--------
.../CodeGen/AMDGPU/expert_scheduling_gfx12.mir | 2 +-
.../AMDGPU/function-esm2-prologue-epilogue.ll | 2 +-
llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll | 2 +-
5 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d5c72e52c3cee..105ec0f4a9490 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1478,9 +1478,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
- /// \returns true if the target supports using software to avoid hazards
- /// between VMEM and VALU instructions in some instances.
- bool hasSoftwareHazardMode() const { return getGeneration() >= GFX12; }
+ /// \returns true if the target supports expert scheduling mode 2, which
+ /// relies on the compiler to insert waits to avoid hazards between VMEM
+ /// and VALU instructions in some instances.
+ bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
/// \returns true if the target has s_wait_xcnt insertion. Supported for
/// GFX1250.
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index a8613d586ed0f..41743cdedbef7 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -63,8 +63,8 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);
-static cl::opt<bool> SoftwareHazardModeFlag(
- "amdgpu-software-hazard-mode",
+static cl::opt<bool> ExpertSchedulingModeFlag(
+ "amdgpu-expert-scheduling-mode",
cl::desc("Enable expert scheduling mode 2 for all kernel functions (GFX12+ "
"only)"),
cl::init(false), cl::Hidden);
@@ -602,7 +602,7 @@ class SIInsertWaitcnts {
}
std::optional<WaitEventType>
- getSoftwareHazardEventType(const MachineInstr &Inst) const;
+ getExpertSchedulingEventType(const MachineInstr &Inst) const;
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
@@ -2372,7 +2372,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
}
std::optional<WaitEventType>
-SIInsertWaitcnts::getSoftwareHazardEventType(const MachineInstr &Inst) const {
+SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
if (TII->isVALU(Inst)) {
// Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
// out-of-order with respect to each other, so each of these classes
@@ -2484,7 +2484,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
bool IsSMEMAccess = false;
if (isExpertMode(MaxCounter)) {
- if (const auto ET = getSoftwareHazardEventType(Inst))
+ if (const auto ET = getExpertSchedulingEventType(Inst))
ScoreBrackets->updateByEvent(*ET, Inst);
}
@@ -2979,11 +2979,11 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
if (ST->hasExtendedWaitCounts()) {
- if (ST->hasSoftwareHazardMode() &&
+ if (ST->hasExpertSchedulingMode() &&
(MF.getFunction()
- .getFnAttribute("amdgpu-software-hazard-mode")
+ .getFnAttribute("amdgpu-expert-scheduling-mode")
.getValueAsBool() ||
- SoftwareHazardModeFlag))
+ ExpertSchedulingModeFlag))
MaxCounter = NUM_EXPERT_INST_CNTS;
else
MaxCounter = NUM_EXTENDED_INST_CNTS;
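The comment in getExpertSchedulingEventType above captures the hazard model: the Core/Side, DP, XDL and TRANS VALU classes retire out of order with respect to one another, so VA_VDST tracking has to score each class with its own event. A hedged illustration of that dispatch shape; the enum and the flags are stand-ins, not the pass's actual WaitEventType values or predicates.

// Hypothetical classifier mirroring the quoted comment. The real pass
// derives these properties from SIInstrInfo predicates.
struct ValuClassFlags {
  bool IsTrans, IsXdl, IsDp;
};

enum class ValuCompletionClass { CoreOrSide, DpMacc, XdlMacc, Trans };

static ValuCompletionClass classifyValu(ValuClassFlags F) {
  if (F.IsTrans)
    return ValuCompletionClass::Trans;
  if (F.IsXdl)
    return ValuCompletionClass::XdlMacc;
  if (F.IsDp)
    return ValuCompletionClass::DpMacc;
  // Core/side ALU results complete in order relative to each other, but
  // not relative to the other three classes.
  return ValuCompletionClass::CoreOrSide;
}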
diff --git a/llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx12.mir b/llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx12.mir
index e615a83eb4da5..b0b83a6066074 100644
--- a/llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx12.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-software-hazard-mode -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-expert-scheduling-mode -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-waitcnt-forcezero -run-pass si-insert-waitcnts %s -o - | FileCheck %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/function-esm2-prologue-epilogue.ll b/llvm/test/CodeGen/AMDGPU/function-esm2-prologue-epilogue.ll
index efe40c58f1609..6a80efb79abc4 100644
--- a/llvm/test/CodeGen/AMDGPU/function-esm2-prologue-epilogue.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-esm2-prologue-epilogue.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1200 -amdgpu-software-hazard-mode | FileCheck %s --check-prefix=GFX12
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1200 -amdgpu-expert-scheduling-mode | FileCheck %s --check-prefix=GFX12
declare i16 @llvm.bswap.i16(i16) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index 36cf229d4cf4f..2defcc823febe 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch -amdgpu-software-hazard-mode < %s | FileCheck --check-prefix=GFX12ES2-SPREFETCH %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch -amdgpu-expert-scheduling-mode < %s | FileCheck --check-prefix=GFX12ES2-SPREFETCH %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX1250 %s
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
>From 5c5975ad7d6238866e4498f0d39bf0286cce0920 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 5 Dec 2025 15:06:10 +0000
Subject: [PATCH 3/6] Use explicit IsExpertMode flag
---
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 45 +++++++++++----------
1 file changed, 23 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c32b0a39e9254..f84db432bb96a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -222,10 +222,6 @@ static bool isNormalMode(InstCounterType MaxCounter) {
}
#endif // NDEBUG
-static bool isExpertMode(InstCounterType MaxCounter) {
- return MaxCounter == NUM_EXPERT_INST_CNTS;
-}
-
VmemType getVmemType(const MachineInstr &Inst) {
assert(updateVMCntOnly(Inst));
if (!SIInstrInfo::isImage(Inst))
@@ -406,8 +402,14 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
};
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
+protected:
+ bool IsExpertMode;
+
public:
- using WaitcntGenerator::WaitcntGenerator;
+ WaitcntGeneratorGFX12Plus() = default;
+ WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
+ InstCounterType MaxCounter, bool IsExpertMode)
+ : WaitcntGenerator(MF, MaxCounter), IsExpertMode(IsExpertMode) {}
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -449,6 +451,7 @@ class SIInsertWaitcnts {
const MachineRegisterInfo *MRI = nullptr;
InstCounterType SmemAccessCounter;
InstCounterType MaxCounter;
+ bool IsExpertMode = false;
const unsigned *WaitEventMaskForInst;
private:
@@ -1644,7 +1647,7 @@ WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
- unsigned ExpertVal = isExpertMode(MaxCounter) ? 0 : ~0u;
+ unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
~0u /* XCNT */, ExpertVal, ExpertVal);
}
@@ -1990,7 +1993,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
}
if (Wait.hasWaitDepctr()) {
- assert(isExpertMode(MaxCounter));
+ assert(IsExpertMode);
unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.VmVsrc, *ST);
Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
@@ -2302,7 +2305,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (ForceEmitWaitcnt[X_CNT])
Wait.XCnt = 0;
// Only force emit VA_VDST and VM_VSRC if expert mode is enabled.
- if (isExpertMode(MaxCounter)) {
+ if (IsExpertMode) {
if (ForceEmitWaitcnt[VA_VDST])
Wait.VaVdst = 0;
if (ForceEmitWaitcnt[VM_VSRC])
@@ -2475,7 +2478,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
bool IsVMEMAccess = false;
bool IsSMEMAccess = false;
- if (isExpertMode(MaxCounter)) {
+ if (IsExpertMode) {
if (const auto ET = getExpertSchedulingEventType(Inst))
ScoreBrackets->updateByEvent(*ET, Inst);
}
@@ -2724,8 +2727,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Track pre-existing waitcnts that were added in earlier iterations or by
// the memory legalizer.
- if (isWaitInstr(Inst) || (isExpertMode(MaxCounter) &&
- Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
+ if (isWaitInstr(Inst) ||
+ (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
if (!OldWaitcntInstr)
OldWaitcntInstr = &Inst;
++Iter;
@@ -2974,15 +2977,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
if (ST->hasExtendedWaitCounts()) {
- if (ST->hasExpertSchedulingMode() &&
- (MF.getFunction()
- .getFnAttribute("amdgpu-expert-scheduling-mode")
- .getValueAsBool() ||
- ExpertSchedulingModeFlag))
- MaxCounter = NUM_EXPERT_INST_CNTS;
- else
- MaxCounter = NUM_EXTENDED_INST_CNTS;
- WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
+ IsExpertMode = ST->hasExpertSchedulingMode() &&
+ (MF.getFunction()
+ .getFnAttribute("amdgpu-expert-scheduling-mode")
+ .getValueAsBool() ||
+ ExpertSchedulingModeFlag);
+ MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
+ WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter, IsExpertMode);
WCG = &WCGGFX12Plus;
} else {
MaxCounter = NUM_NORMAL_INST_CNTS;
@@ -3051,7 +3052,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
TII->get(instrsForExtendedCounterTypes[CT]))
.addImm(0);
}
- if (isExpertMode(MaxCounter)) {
+ if (IsExpertMode) {
unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST);
Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0);
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR))
@@ -3066,7 +3067,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
Modified = true;
- } else if (isExpertMode(MaxCounter)) {
+ } else if (IsExpertMode) {
for (MachineBasicBlock::iterator E = EntryBB.end();
I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
;
>From afd47214d42f7e78e3f1e1de97540bf826756661 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 5 Dec 2025 15:17:12 +0000
Subject: [PATCH 4/6] Make command line option take precedence over function
attribute
---
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index f84db432bb96a..83619a91f3f7f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2978,10 +2978,11 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
if (ST->hasExtendedWaitCounts()) {
IsExpertMode = ST->hasExpertSchedulingMode() &&
- (MF.getFunction()
- .getFnAttribute("amdgpu-expert-scheduling-mode")
- .getValueAsBool() ||
- ExpertSchedulingModeFlag);
+ (ExpertSchedulingModeFlag.getNumOccurrences()
+ ? ExpertSchedulingModeFlag
+ : MF.getFunction()
+ .getFnAttribute("amdgpu-expert-scheduling-mode")
+ .getValueAsBool());
MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter, IsExpertMode);
WCG = &WCGGFX12Plus;
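The refactor above hinges on cl::opt::getNumOccurrences(), which distinguishes "flag explicitly passed (even as false)" from "flag left at its default". A minimal standalone sketch of the precedence logic; the helper name is hypothetical, while the option and attribute names mirror the patch.

#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

static cl::opt<bool> ExpertSchedulingModeFlag(
    "amdgpu-expert-scheduling-mode", cl::init(false), cl::Hidden);

// Hypothetical helper: the command line wins when present; otherwise the
// per-function attribute decides.
static bool useExpertMode(const Function &F, bool TargetSupportsIt) {
  if (!TargetSupportsIt)
    return false;
  if (ExpertSchedulingModeFlag.getNumOccurrences())
    return ExpertSchedulingModeFlag;
  return F.getFnAttribute("amdgpu-expert-scheduling-mode").getValueAsBool();
}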
>From 7a814fb9350a9b68f687ef01fd0ed01d8451f878 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 8 Dec 2025 14:01:11 +0000
Subject: [PATCH 5/6] Remove unneeded if
---
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 83619a91f3f7f..72534cd1bf88a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1997,17 +1997,14 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.VmVsrc, *ST);
Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
- if (Enc != 0xffff) {
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(Enc);
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
- Modified = true;
+ Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
- }
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
}
return Modified;
>From 3dc8d946b71f3f6b39fe2e814419981ec13dcedf Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 8 Dec 2025 15:14:06 +0000
Subject: [PATCH 6/6] Use getDefaultDepCtrEncoding
---
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 72534cd1bf88a..45cc2ee34d9b7 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1753,7 +1753,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
- if (Enc != 0xffff) {
+ if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST)) {
Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
Modified |= promoteSoftWaitCnt(&II);
} else {
@@ -1912,7 +1912,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
// If that new encoded Depctr immediate would actually still wait
// for anything, update the instruction's operand. Otherwise it can
// just be deleted.
- if (Enc != 0xffff) {
+ if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST)) {
Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
AMDGPU::OpName::simm16, Enc);
LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
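Patch 6 makes the "does this wait do anything" test target-independent: a depctr immediate whose every field sits at its default waits on nothing, so the S_WAITCNT_DEPCTR carrying it can be erased. A minimal sketch under the same bitfield assumptions as the decode example earlier; the in-tree helpers are AMDGPU::DepCtr::encodeField* and getDefaultDepCtrEncoding, and 0xffff happens to be that default on current targets.

// Sketch only. Setting a field back to its maximum ("no wait") value and
// landing on DefaultEnc means the instruction is a no-op.
static unsigned setVmVsrc(unsigned Enc, unsigned V) {
  return (Enc & ~(0x7u << 2)) | ((V & 0x7u) << 2);
}
static unsigned setVaVdst(unsigned Enc, unsigned V) {
  return (Enc & ~(0xFu << 12)) | ((V & 0xFu) << 12);
}
static bool waitsOnNothing(unsigned Enc, unsigned DefaultEnc) {
  return Enc == DefaultEnc; // e.g. setVaVdst(setVmVsrc(DefaultEnc, 7), 15)
}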