[llvm] [AMDGPU][GFX1250] Insert S_WAIT_XCNT for SMEM and VMEM load-stores (PR #145566)
Christudasan Devadasan via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 24 11:27:54 PDT 2025
https://github.com/cdevadas created https://github.com/llvm/llvm-project/pull/145566
This patch tracks the register operands of both VMEM (FLAT, MUBUF,
MTBUF) and SMEM load-store operations and inserts a S_WAIT_XCNT
instruction with sufficient wait-count before potentially redefining
them. For VMEM instructions, XNACK is returned in the same order as
they were issued and hence non-zero counter values can be inserted.
However, SMEM execution is out-of-order and so is their XNACK reception.
Thus, only zero counter value can be inserted to capture SMEM dependencies.
>From f1961c829cddd705d95613977584b4938499c7ca Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Sun, 23 Jul 2023 22:31:57 +0530
Subject: [PATCH] [AMDGPU][GFX1250] Insert S_WAIT_XCNT for SMEM and VMEM
load-stores
This patch tracks the register operands of both VMEM (FLAT, MUBUF,
MTBUF) and SMEM load-store operations and inserts a S_WAIT_XCNT
instruction with sufficient wait-count before potentially redefining
them. For VMEM instructions, XNACK is returned in the same order as
they were issued and hence non-zero counter values can be inserted.
However, SMEM execution is out-of-order and so is their XNACK reception.
Thus, only zero counter value can be inserted to capture SMEM dependencies.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 137 ++-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 9 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 13 +-
llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 396 ++++++++
llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 922 ++++++++++++++++++
5 files changed, 1457 insertions(+), 20 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9a7dd3c31e498..f43831016952a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -73,6 +73,7 @@ enum InstCounterType {
SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
BVH_CNT, // gfx12+ only.
KM_CNT, // gfx12+ only.
+ X_CNT, // gfx1250.
NUM_EXTENDED_INST_CNTS,
NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
@@ -102,6 +103,7 @@ struct HardwareLimits {
unsigned SamplecntMax; // gfx12+ only.
unsigned BvhcntMax; // gfx12+ only.
unsigned KmcntMax; // gfx12+ only.
+ unsigned XcntMax; // gfx1250.
};
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
@@ -111,10 +113,12 @@ struct HardwareLimits {
DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
+ DECL(VMEM_GROUP) /* vmem group */ \
DECL(LDS_ACCESS) /* lds read & write */ \
DECL(GDS_ACCESS) /* gds read & write */ \
DECL(SQ_MESSAGE) /* send message */ \
DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
+ DECL(SMEM_GROUP) /* scalar-memory group */ \
DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
DECL(EXP_POS_ACCESS) /* write to export position */ \
@@ -178,7 +182,7 @@ enum VmemType {
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
- AMDGPU::S_WAIT_KMCNT};
+ AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
static bool updateVMCntOnly(const MachineInstr &Inst) {
return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
@@ -223,6 +227,8 @@ unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
return Wait.BvhCnt;
case KM_CNT:
return Wait.KmCnt;
+ case X_CNT:
+ return Wait.XCnt;
default:
llvm_unreachable("bad InstCounterType");
}
@@ -283,12 +289,27 @@ class WaitcntBrackets {
return Limits.BvhcntMax;
case KM_CNT:
return Limits.KmcntMax;
+ case X_CNT:
+ return Limits.XcntMax;
default:
break;
}
return 0;
}
+ bool isSmemCounter(InstCounterType T) const {
+ return T == SmemAccessCounter || T == X_CNT;
+ }
+
+ unsigned getSgprScoresIdx(InstCounterType T) const {
+ if (T == SmemAccessCounter)
+ return 0;
+ if (T == X_CNT)
+ return 1;
+
+ llvm_unreachable("Invalid SMEM counter");
+ }
+
unsigned getScoreLB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
return ScoreLBs[T];
@@ -307,8 +328,8 @@ class WaitcntBrackets {
if (GprNo < NUM_ALL_VGPRS) {
return VgprScores[T][GprNo];
}
- assert(T == SmemAccessCounter);
- return SgprScores[GprNo - NUM_ALL_VGPRS];
+ assert(isSmemCounter(T));
+ return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
}
bool merge(const WaitcntBrackets &Other);
@@ -331,6 +352,7 @@ class WaitcntBrackets {
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
+ void applyXcnt(const AMDGPU::Waitcnt &Wait);
void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, WaitEventType E,
MachineInstr &MI);
@@ -462,9 +484,11 @@ class WaitcntBrackets {
int VgprUB = -1;
int SgprUB = -1;
unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
- // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
- unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+ // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
+ // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
+ // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
+ // X_CNT score.
+ unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
// write to each vgpr.
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
@@ -572,6 +596,7 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
0,
0,
+ 0,
0};
return WaitEventMaskForInstPreGFX12;
@@ -607,7 +632,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
eventMask({VMEM_SAMPLER_READ_ACCESS}),
eventMask({VMEM_BVH_READ_ACCESS}),
- eventMask({SMEM_ACCESS, SQ_MESSAGE})};
+ eventMask({SMEM_ACCESS, SQ_MESSAGE}),
+ eventMask({VMEM_GROUP, SMEM_GROUP})};
return WaitEventMaskForInstGFX12Plus;
}
@@ -743,9 +769,12 @@ class SIInsertWaitcnts {
return VmemReadMapping[getVmemType(Inst)];
}
+ bool hasXcnt() const { return ST->hasWaitXCnt(); }
+
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+ bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
@@ -837,9 +866,9 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
VgprUB = std::max(VgprUB, RegNo);
VgprScores[CntTy][RegNo] = Score;
} else {
- assert(CntTy == SmemAccessCounter);
+ assert(isSmemCounter(CntTy));
SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
- SgprScores[RegNo - NUM_ALL_VGPRS] = Score;
+ SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
}
}
}
@@ -976,6 +1005,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
}
}
+ } else if (T == X_CNT) {
+ for (const MachineOperand &Op : Inst.all_uses()) {
+ RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ setRegScore(RegNo, T, CurrScore);
+ }
+ }
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
// Match the score to the destination registers.
//
@@ -1080,6 +1116,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
case KM_CNT:
OS << " KM_CNT(" << SR << "): ";
break;
+ case X_CNT:
+ OS << " X_CNT(" << SR << "): ";
+ break;
default:
OS << " UNKNOWN(" << SR << "): ";
break;
@@ -1100,8 +1139,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
OS << RelScore << ":ds ";
}
}
- // Also need to print sgpr scores for lgkm_cnt.
- if (T == SmemAccessCounter) {
+ // Also need to print sgpr scores for lgkm_cnt or xcnt.
+ if (isSmemCounter(T)) {
for (int J = 0; J <= SgprUB; J++) {
unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
if (RegScore <= LB)
@@ -1140,6 +1179,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
+ simplifyWaitcnt(X_CNT, Wait.XCnt);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1191,6 +1231,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
applyWaitcnt(BVH_CNT, Wait.BvhCnt);
applyWaitcnt(KM_CNT, Wait.KmCnt);
+ applyXcnt(Wait);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1207,11 +1248,29 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}
}
+void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+ // Wait on XCNT is redundant if we are already waiting for a load to complete.
+ // SMEM can return out of order, so only omit XCNT wait if we are waiting till
+ // zero.
+ if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
+ return applyWaitcnt(X_CNT, 0);
+
+ // If we have a pending store we cannot optimize XCnt because we do not wait
+ // for stores. VMEM loads return in order, so if we only have loads XCnt is
+ // decremented to the same number as LOADCnt.
+ if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+ !hasPendingEvent(STORE_CNT))
+ return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+
+ applyWaitcnt(X_CNT, Wait.XCnt);
+}
+
// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory read always can go out of order.
- if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
+ if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
+ (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
return true;
return hasMixedPendingEvents(T);
}
@@ -1263,6 +1322,8 @@ static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
return DS_CNT;
case AMDGPU::S_WAIT_KMCNT:
return KM_CNT;
+ case AMDGPU::S_WAIT_XCNT:
+ return X_CNT;
default:
return {};
}
@@ -1427,7 +1488,8 @@ WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
- return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+ return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
+ ~0u /* XCNT */);
}
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -1909,6 +1971,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
ScoreBrackets.clearVgprVmemTypes(Interval);
}
+
if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
}
@@ -1916,6 +1979,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
} else {
ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
}
+
+ if (hasXcnt() && Op.isDef())
+ ScoreBrackets.determineWait(X_CNT, Interval, Wait);
}
}
}
@@ -1958,6 +2024,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.BvhCnt = 0;
if (ForceEmitWaitcnt[KM_CNT])
Wait.KmCnt = 0;
+ if (ForceEmitWaitcnt[X_CNT])
+ Wait.XCnt = 0;
if (FlushVmCnt) {
if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
@@ -2007,6 +2075,21 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
+ // XCnt may be already consumed by a load wait.
+ if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
+ !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
+ Wait.XCnt = ~0u;
+
+ if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
+ !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+ Wait.XCnt = ~0u;
+
+ // Since the translation for VMEM addresses occurs in-order, we can skip the
+ // XCnt if the current instruction is of VMEM type and has a memory dependency
+ // with another VMEM instruction in flight.
+ if (Wait.XCnt != ~0u && isVmemAccess(*It))
+ Wait.XCnt = ~0u;
+
if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
@@ -2096,6 +2179,11 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
});
}
+bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
+ return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+ (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
+}
+
static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
auto Opc = Inst.getOpcode();
return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
@@ -2167,6 +2255,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// bracket and the destination operand scores.
// TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
+ bool IsVMEMAccess = false;
+ bool IsSMEMAccess = false;
if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
if (TII->isAlwaysGDS(Inst.getOpcode()) ||
TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
@@ -2189,6 +2279,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (mayAccessVMEMThroughFlat(Inst)) {
++FlatASCount;
+ IsVMEMAccess = true;
ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
Inst);
}
@@ -2208,6 +2299,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->setPendingFlat();
} else if (SIInstrInfo::isVMEM(Inst) &&
!llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
+ IsVMEMAccess = true;
ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
Inst);
@@ -2216,6 +2308,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}
} else if (TII->isSMRD(Inst)) {
+ IsSMEMAccess = true;
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
} else if (Inst.isCall()) {
if (callWaitsOnFunctionReturn(Inst)) {
@@ -2258,6 +2351,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
break;
}
}
+
+ if (!hasXcnt())
+ return;
+
+ if (IsVMEMAccess)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);
+
+ if (IsSMEMAccess)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst);
}
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
@@ -2311,9 +2413,11 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
for (int J = 0; J <= VgprUB; J++)
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
- if (T == SmemAccessCounter) {
+ if (isSmemCounter(T)) {
+ unsigned Idx = getSgprScoresIdx(T);
for (int J = 0; J <= SgprUB; J++)
- StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
+ StrictDom |=
+ mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
}
}
@@ -2651,6 +2755,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
+ Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
[[maybe_unused]] unsigned NumVGPRsMax =
ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
@@ -2679,7 +2784,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
.addImm(0);
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
+ if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
continue;
if (!ST->hasImageInsts() &&
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0e5493259edb9..13549e5c4e58b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -129,6 +129,11 @@ unsigned getKmcntBitWidth(unsigned VersionMajor) {
return VersionMajor >= 12 ? 5 : 0;
}
+/// \returns Xcnt bit width.
+unsigned getXcntBitWidth(unsigned VersionMajor, unsigned VersionMinor) {
+ return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0;
+}
+
/// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions.
unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) {
return VersionMajor >= 12 ? 8 : 0;
@@ -1493,6 +1498,10 @@ unsigned getKmcntBitMask(const IsaVersion &Version) {
return (1 << getKmcntBitWidth(Version.Major)) - 1;
}
+unsigned getXcntBitMask(const IsaVersion &Version) {
+ return (1 << getXcntBitWidth(Version.Major, Version.Minor)) - 1;
+}
+
unsigned getStorecntBitMask(const IsaVersion &Version) {
return (1 << getStorecntBitWidth(Version.Major)) - 1;
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ac7c5100be3d4..e6078d6918ac2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -980,6 +980,7 @@ struct Waitcnt {
unsigned SampleCnt = ~0u; // gfx12+ only.
unsigned BvhCnt = ~0u; // gfx12+ only.
unsigned KmCnt = ~0u; // gfx12+ only.
+ unsigned XCnt = ~0u; // gfx1250.
Waitcnt() = default;
// Pre-gfx12 constructor.
@@ -988,15 +989,15 @@ struct Waitcnt {
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
- unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt)
+ unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt)
: LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
- SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {}
+ SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt) {}
bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
bool hasWaitExceptStoreCnt() const {
return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u ||
- SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u;
+ SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u;
}
bool hasWaitStoreCnt() const { return StoreCnt != ~0u; }
@@ -1008,7 +1009,7 @@ struct Waitcnt {
std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt),
std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt),
std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt),
- std::min(KmCnt, Other.KmCnt));
+ std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt));
}
};
@@ -1114,6 +1115,10 @@ unsigned getDscntBitMask(const IsaVersion &Version);
/// Returns 0 for versions that do not support KMcnt
unsigned getKmcntBitMask(const IsaVersion &Version);
+/// \returns Xcnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support Xcnt.
+unsigned getXcntBitMask(const IsaVersion &Version);
+
/// \return STOREcnt or VScnt bit mask for given isa \p Version.
/// returns 0 for versions that do not support STOREcnt or VScnt.
/// STOREcnt and VScnt are the same counter, the name used
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
new file mode 100644
index 0000000000000..f86216837fe0e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -0,0 +1,396 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
+
+; Test S_WAIT_XCNT insertion for global_load/store instructions.
+; Introduced additional operations in between the clauses to have the register dependency
+; between the operands of VMEM operations and the def ops of VALU instructions that followed.
+
+define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %ptr_b, ptr addrspace(1) %ptr_c, ptr addrspace(1) %ptr_d, ptr addrspace(1) %out) {
+; GCN-SDAG-LABEL: test_i8load_v4i8store:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT: global_load_u8 v2, v[2:3], off
+; GCN-SDAG-NEXT: global_load_u8 v3, v[4:5], off
+; GCN-SDAG-NEXT: global_load_u8 v0, v[0:1], off
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x2
+; GCN-SDAG-NEXT: s_wait_xcnt 0x0
+; GCN-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
+; GCN-SDAG-NEXT: v_lshlrev_b16 v2, 8, v3
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-SDAG-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-SDAG-NEXT: global_store_b32 v[8:9], v0, off
+; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_i8load_v4i8store:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: global_load_u8 v0, v[0:1], off
+; GCN-GISEL-NEXT: global_load_u8 v1, v[2:3], off
+; GCN-GISEL-NEXT: global_load_u8 v2, v[4:5], off
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x1
+; GCN-GISEL-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GCN-GISEL-NEXT: global_store_b32 v[8:9], v0, off
+; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
+ %a = load i8, ptr addrspace(1) %ptr_a
+ %b = load i8, ptr addrspace(1) %ptr_b
+ %c = load i8, ptr addrspace(1) %ptr_c
+ %d = load i8, ptr addrspace(1) %ptr_d
+ %ins_0 = insertelement <4 x i8> undef, i8 %a, i32 0
+ %ins_1 = insertelement <4 x i8> %ins_0, i8 %b, i32 1
+ %ins_2 = insertelement <4 x i8> %ins_1, i8 %c, i32 2
+ %ins_3 = insertelement <4 x i8> %ins_2, i8 %c, i32 3
+ store <4 x i8> %ins_3, ptr addrspace(1) %out
+ ret void
+}
+
+define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) {
+; GCN-SDAG-LABEL: test_v7i16_load_store:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT: global_load_b128 v[4:7], v[0:1], off
+; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[2:3], off
+; GCN-SDAG-NEXT: v_mov_b32_e32 v8, 0
+; GCN-SDAG-NEXT: v_mov_b32_e32 v9, 0
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: v_pk_add_u16 v10, v6, v2
+; GCN-SDAG-NEXT: v_pk_add_u16 v11, v7, v3
+; GCN-SDAG-NEXT: s_wait_xcnt 0x0
+; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 12
+; GCN-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v6, 8
+; GCN-SDAG-NEXT: v_pk_add_u16 v4, v4, v0
+; GCN-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GCN-SDAG-NEXT: v_mov_b32_e32 v7, 0
+; GCN-SDAG-NEXT: v_pk_add_u16 v5, v5, v1
+; GCN-SDAG-NEXT: s_clause 0x2
+; GCN-SDAG-NEXT: global_store_b16 v[2:3], v11, off
+; GCN-SDAG-NEXT: global_store_b32 v[6:7], v10, off
+; GCN-SDAG-NEXT: global_store_b64 v[8:9], v[4:5], off
+; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_v7i16_load_store:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off
+; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[2:3], off
+; GCN-GISEL-NEXT: v_mov_b32_e32 v8, 0
+; GCN-GISEL-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 2
+; GCN-GISEL-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v12, 4
+; GCN-GISEL-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 6
+; GCN-GISEL-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 8
+; GCN-GISEL-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 10
+; GCN-GISEL-NEXT: v_dual_mov_b32 v20, 12 :: v_dual_mov_b32 v19, 0
+; GCN-GISEL-NEXT: v_mov_b32_e32 v21, 0
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: v_pk_add_u16 v2, v6, v2
+; GCN-GISEL-NEXT: v_pk_add_u16 v4, v4, v0
+; GCN-GISEL-NEXT: v_pk_add_u16 v1, v5, v1
+; GCN-GISEL-NEXT: v_pk_add_u16 v3, v7, v3
+; GCN-GISEL-NEXT: s_clause 0x6
+; GCN-GISEL-NEXT: global_store_b16 v[8:9], v4, off
+; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[10:11], v4, off
+; GCN-GISEL-NEXT: global_store_b16 v[12:13], v1, off
+; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[14:15], v1, off
+; GCN-GISEL-NEXT: global_store_b16 v[16:17], v2, off
+; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[18:19], v2, off
+; GCN-GISEL-NEXT: global_store_b16 v[20:21], v3, off
+; GCN-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
+ %vec1 = load <7 x i16>, ptr addrspace(1) %ptr1
+ %insert = insertelement <7 x i16> %vec1, i16 20, i32 4
+ %vec2 = load <7 x i16>, ptr addrspace(1) %ptr2
+ %add = add <7 x i16> %vec1, %vec2
+ store <7 x i16> %add, ptr addrspace(1) null
+ %elt = extractelement <7 x i16> %add, i32 5
+ ret i16 %elt
+}
+
+define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspace(1) %out) {
+; GCN-SDAG-LABEL: test_v64i32_load_store:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT: s_clause 0xc
+; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:48
+; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:44
+; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:40
+; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:36
+; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:32
+; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28
+; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24
+; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20
+; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16
+; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12
+; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8
+; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4
+; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32
+; GCN-SDAG-NEXT: global_load_b128 v[5:8], v[0:1], off offset:224
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[5:8], s32 offset:68 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: global_load_b128 v[5:8], v[0:1], off offset:240
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[5:8], s32 offset:84 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: s_clause 0xc
+; GCN-SDAG-NEXT: global_load_b128 v[13:16], v[0:1], off offset:192
+; GCN-SDAG-NEXT: global_load_b128 v[17:20], v[0:1], off offset:208
+; GCN-SDAG-NEXT: global_load_b128 v[21:24], v[0:1], off offset:160
+; GCN-SDAG-NEXT: global_load_b128 v[25:28], v[0:1], off offset:176
+; GCN-SDAG-NEXT: global_load_b128 v[29:32], v[0:1], off offset:128
+; GCN-SDAG-NEXT: global_load_b128 v[33:36], v[0:1], off offset:144
+; GCN-SDAG-NEXT: global_load_b128 v[48:51], v[0:1], off offset:96
+; GCN-SDAG-NEXT: global_load_b128 v[52:55], v[0:1], off offset:112
+; GCN-SDAG-NEXT: global_load_b128 v[37:40], v[0:1], off offset:64
+; GCN-SDAG-NEXT: global_load_b128 v[41:44], v[0:1], off offset:80
+; GCN-SDAG-NEXT: global_load_b128 v[56:59], v[0:1], off offset:32
+; GCN-SDAG-NEXT: global_load_b128 v[60:63], v[0:1], off offset:48
+; GCN-SDAG-NEXT: global_load_b128 v[5:8], v[0:1], off
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[5:8], s32 offset:52 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: global_load_b128 v[5:8], v[0:1], off offset:16
+; GCN-SDAG-NEXT: scratch_load_b128 v[9:12], off, s32 offset:68 th:TH_LOAD_LU ; 16-byte Folded Reload
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
+; GCN-SDAG-NEXT: s_wait_xcnt 0x1
+; GCN-SDAG-NEXT: v_mov_b32_e32 v0, v7
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[9:12], off offset:224
+; GCN-SDAG-NEXT: scratch_load_b128 v[9:12], off, s32 offset:84 th:TH_LOAD_LU ; 16-byte Folded Reload
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: s_clause 0xc
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[9:12], off offset:240
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[13:16], off offset:192
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[17:20], off offset:208
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[21:24], off offset:160
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[25:28], off offset:176
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[29:32], off offset:128
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[33:36], off offset:144
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[48:51], off offset:96
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[52:55], off offset:112
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[37:40], off offset:64
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[41:44], off offset:80
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[56:59], off offset:32
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[60:63], off offset:48
+; GCN-SDAG-NEXT: scratch_load_b128 v[9:12], off, s32 offset:52 th:TH_LOAD_LU ; 16-byte Folded Reload
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: s_clause 0x1
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[9:12], off
+; GCN-SDAG-NEXT: global_store_b128 v[3:4], v[5:8], off offset:16
+; GCN-SDAG-NEXT: s_clause 0xc
+; GCN-SDAG-NEXT: scratch_load_b32 v63, off, s32
+; GCN-SDAG-NEXT: scratch_load_b32 v62, off, s32 offset:4
+; GCN-SDAG-NEXT: scratch_load_b32 v61, off, s32 offset:8
+; GCN-SDAG-NEXT: scratch_load_b32 v60, off, s32 offset:12
+; GCN-SDAG-NEXT: scratch_load_b32 v59, off, s32 offset:16
+; GCN-SDAG-NEXT: scratch_load_b32 v58, off, s32 offset:20
+; GCN-SDAG-NEXT: scratch_load_b32 v57, off, s32 offset:24
+; GCN-SDAG-NEXT: scratch_load_b32 v56, off, s32 offset:28
+; GCN-SDAG-NEXT: scratch_load_b32 v44, off, s32 offset:32
+; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32 offset:36
+; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:40
+; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:44
+; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:48
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_v64i32_load_store:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: s_clause 0xc
+; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:48
+; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:44
+; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:40
+; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:36
+; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:32
+; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28
+; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24
+; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20
+; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16
+; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12
+; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8
+; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4
+; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32
+; GCN-GISEL-NEXT: global_load_b128 v[5:8], v[0:1], off offset:32
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[5:8], s32 offset:52 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: global_load_b128 v[5:8], v[0:1], off offset:48
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[5:8], s32 offset:68 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: s_clause 0xd
+; GCN-GISEL-NEXT: global_load_b128 v[13:16], v[0:1], off offset:64
+; GCN-GISEL-NEXT: global_load_b128 v[17:20], v[0:1], off offset:80
+; GCN-GISEL-NEXT: global_load_b128 v[21:24], v[0:1], off offset:96
+; GCN-GISEL-NEXT: global_load_b128 v[25:28], v[0:1], off offset:112
+; GCN-GISEL-NEXT: global_load_b128 v[29:32], v[0:1], off offset:128
+; GCN-GISEL-NEXT: global_load_b128 v[33:36], v[0:1], off offset:144
+; GCN-GISEL-NEXT: global_load_b128 v[48:51], v[0:1], off offset:160
+; GCN-GISEL-NEXT: global_load_b128 v[52:55], v[0:1], off offset:176
+; GCN-GISEL-NEXT: global_load_b128 v[37:40], v[0:1], off offset:192
+; GCN-GISEL-NEXT: global_load_b128 v[41:44], v[0:1], off offset:208
+; GCN-GISEL-NEXT: global_load_b128 v[56:59], v[0:1], off offset:224
+; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off
+; GCN-GISEL-NEXT: global_load_b128 v[5:8], v[0:1], off offset:16
+; GCN-GISEL-NEXT: global_load_b128 v[9:12], v[0:1], off offset:240
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x1
+; GCN-GISEL-NEXT: s_wait_xcnt 0x0
+; GCN-GISEL-NEXT: v_mov_b32_e32 v0, v7
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[9:12], s32 offset:84 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: scratch_load_b128 v[9:12], off, s32 offset:52 th:TH_LOAD_LU ; 16-byte Folded Reload
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[9:12], off offset:32
+; GCN-GISEL-NEXT: scratch_load_b128 v[9:12], off, s32 offset:68 th:TH_LOAD_LU ; 16-byte Folded Reload
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: s_clause 0xd
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[9:12], off offset:48
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[13:16], off offset:64
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[17:20], off offset:80
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[21:24], off offset:96
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[25:28], off offset:112
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[29:32], off offset:128
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[33:36], off offset:144
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[48:51], off offset:160
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[52:55], off offset:176
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[37:40], off offset:192
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[41:44], off offset:208
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[56:59], off offset:224
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[60:63], off
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[5:8], off offset:16
+; GCN-GISEL-NEXT: scratch_load_b128 v[8:11], off, s32 offset:84 th:TH_LOAD_LU ; 16-byte Folded Reload
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: global_store_b128 v[3:4], v[8:11], off offset:240
+; GCN-GISEL-NEXT: s_clause 0xc
+; GCN-GISEL-NEXT: scratch_load_b32 v63, off, s32
+; GCN-GISEL-NEXT: scratch_load_b32 v62, off, s32 offset:4
+; GCN-GISEL-NEXT: scratch_load_b32 v61, off, s32 offset:8
+; GCN-GISEL-NEXT: scratch_load_b32 v60, off, s32 offset:12
+; GCN-GISEL-NEXT: scratch_load_b32 v59, off, s32 offset:16
+; GCN-GISEL-NEXT: scratch_load_b32 v58, off, s32 offset:20
+; GCN-GISEL-NEXT: scratch_load_b32 v57, off, s32 offset:24
+; GCN-GISEL-NEXT: scratch_load_b32 v56, off, s32 offset:28
+; GCN-GISEL-NEXT: scratch_load_b32 v44, off, s32 offset:32
+; GCN-GISEL-NEXT: scratch_load_b32 v43, off, s32 offset:36
+; GCN-GISEL-NEXT: scratch_load_b32 v42, off, s32 offset:40
+; GCN-GISEL-NEXT: scratch_load_b32 v41, off, s32 offset:44
+; GCN-GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:48
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
+ %vec = load <64 x i32>, ptr addrspace(1) %ptr
+ store <64 x i32> %vec, ptr addrspace(1) %out, align 4
+ %elt = extractelement <64 x i32> %vec, i32 6
+ ret i32 %elt
+}
+
+; TODO: Re-enable this test upstream later. It currently causes a crash
+; during branch relaxation because the gfx1250 real opcode definition for
+; V_LSHL_ADD_U64 has not yet been upstreamed.
+;define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %ptr_b, ptr addrspace(1) %out) {
+; %a = load <16 x i64>, ptr addrspace(1) %ptr_a, align 4
+; %in_a = insertelement <16 x i64> %a, i64 100, i32 5
+; store <16 x i64> %in_a, ptr addrspace(1) null
+; %b = load <16 x i64>, ptr addrspace(1) %ptr_b, align 4
+; %in_b = insertelement <16 x i64> %a, i64 200, i32 10
+; store <16 x i64> %in_b, ptr addrspace(1) null
+; %add = add <16 x i64> %in_a, %in_b
+; store <16 x i64> %add, ptr addrspace(1) %out, align 4
+; %elt = extractelement <16 x i64> %add, i32 1
+; ret i64 %elt
+;}
+
+define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2, ptr addrspace(1) %out) {
+; GCN-SDAG-LABEL: test_v7i16_load_store_kernel:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-SDAG-NEXT: v_mov_b32_e32 v8, 12
+; GCN-SDAG-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 8
+; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GCN-SDAG-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_lshlrev_b32 v4, 4, v0
+; GCN-SDAG-NEXT: s_wait_xcnt 0x0
+; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GCN-SDAG-NEXT: v_mov_b32_e32 v12, 0
+; GCN-SDAG-NEXT: v_mov_b32_e32 v13, 0
+; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT: s_clause 0x1
+; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1]
+; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3]
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: v_pk_add_u16 v3, v3, v7
+; GCN-SDAG-NEXT: v_pk_add_u16 v2, v2, v6
+; GCN-SDAG-NEXT: v_pk_add_u16 v1, v1, v5
+; GCN-SDAG-NEXT: v_pk_add_u16 v0, v0, v4
+; GCN-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GCN-SDAG-NEXT: s_clause 0x2
+; GCN-SDAG-NEXT: global_store_b16 v[8:9], v3, off
+; GCN-SDAG-NEXT: global_store_b32 v[10:11], v2, off
+; GCN-SDAG-NEXT: global_store_b64 v[12:13], v[0:1], off
+; GCN-SDAG-NEXT: global_store_d16_hi_b16 v4, v2, s[4:5]
+; GCN-SDAG-NEXT: s_endpgm
+;
+; GCN-GISEL-LABEL: test_v7i16_load_store_kernel:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GCN-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-GISEL-NEXT: v_mov_b32_e32 v8, 0
+; GCN-GISEL-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 2
+; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GCN-GISEL-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_lshlrev_b32 v4, 4, v0
+; GCN-GISEL-NEXT: s_wait_xcnt 0x0
+; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GCN-GISEL-NEXT: v_mov_b32_e32 v12, 4
+; GCN-GISEL-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 6
+; GCN-GISEL-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 8
+; GCN-GISEL-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 10
+; GCN-GISEL-NEXT: v_dual_mov_b32 v20, 12 :: v_dual_mov_b32 v19, 0
+; GCN-GISEL-NEXT: v_mov_b32_e32 v21, 0
+; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: s_clause 0x1
+; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1]
+; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3]
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: v_pk_add_u16 v0, v0, v4
+; GCN-GISEL-NEXT: v_pk_add_u16 v1, v1, v5
+; GCN-GISEL-NEXT: v_pk_add_u16 v2, v2, v6
+; GCN-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GCN-GISEL-NEXT: v_pk_add_u16 v3, v3, v7
+; GCN-GISEL-NEXT: s_clause 0x6
+; GCN-GISEL-NEXT: global_store_b16 v[8:9], v0, off
+; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[10:11], v0, off
+; GCN-GISEL-NEXT: global_store_b16 v[12:13], v1, off
+; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[14:15], v1, off
+; GCN-GISEL-NEXT: global_store_b16 v[16:17], v2, off
+; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[18:19], v2, off
+; GCN-GISEL-NEXT: global_store_b16 v[20:21], v3, off
+; GCN-GISEL-NEXT: global_store_d16_hi_b16 v4, v2, s[4:5]
+; GCN-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr inbounds <7 x i16>, ptr addrspace(1) %ptr1, i32 %tid
+ %gep2 = getelementptr inbounds <7 x i16>, ptr addrspace(1) %ptr2, i32 %tid
+ %vec1 = load <7 x i16>, ptr addrspace(1) %gep1
+ %insert = insertelement <7 x i16> %vec1, i16 20, i32 4
+ %vec2 = load <7 x i16>, ptr addrspace(1) %gep2
+ %add = add <7 x i16> %vec1, %vec2
+ store <7 x i16> %add, ptr addrspace(1) null
+ %elt = extractelement <7 x i16> %add, i32 5
+ store i16 %elt, ptr addrspace(1) %out
+ ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
new file mode 100644
index 0000000000000..73b994ab2ab8c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -0,0 +1,922 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: vmem_scratch_load
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: vmem_scratch_load
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+ $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: vmem_buffer_load_dword_offset
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-LABEL: name: vmem_buffer_load_dword_offset
+ ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+ $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: vmem_buffer_load_addr
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-LABEL: name: vmem_buffer_load_addr
+ ; GCN: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_ADDR64 $vgpr0_vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+ $vgpr2 = BUFFER_LOAD_DWORD_ADDR64 $vgpr0_vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+...
+
+---
+name: vmem_flat_load
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GCN-LABEL: name: vmem_flat_load
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+ $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+...
+
+---
+name: vmem_global_load
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GCN-LABEL: name: vmem_global_load
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+ $vgpr2 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec:: (load (s32) from `float addrspace(1)* undef`, align 4, addrspace 1)
+ $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+...
+
+---
+name: vmem_global_store
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-LABEL: name: vmem_global_store
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr3, implicit $exec
+ GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr3, implicit $exec
+...
+
+---
+name: vmem_buffer_store
+tracksRegLiveness: true
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GCN-LABEL: name: vmem_buffer_store
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_LOADCNT_DSCNT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr0 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+ BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec
+ $vgpr0 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+...
+
+---
+name: vmem_scratch_store
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GCN-LABEL: name: vmem_scratch_store
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_LOADCNT_DSCNT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+ SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+...
+
+---
+name: smem_load
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr2_sgpr3
+ ; GCN-LABEL: name: smem_load
+ ; GCN: liveins: $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 0, 0 :: (load (s64), addrspace 4)
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 0, 0 :: (load (s64), addrspace 4)
+ $sgpr2 = S_MOV_B32 0
+...
+
+---
+name: smem_store
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr2, $sgpr3
+ ; GCN-LABEL: name: smem_store
+ ; GCN: liveins: $sgpr0, $sgpr2, $sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr0, $sgpr2_sgpr3, 0, 0
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $sgpr3 = S_MOV_B32 0
+ S_STORE_DWORD_IMM $sgpr0, $sgpr2_sgpr3, 0, 0
+ $sgpr3 = S_MOV_B32 0
+...
+
+# 4 global_load instructions together form a load-group.
+
+---
+name: vmem_load_group
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr10
+ ; GCN-LABEL: name: vmem_load_group
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+ ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+ ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 2
+ ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+ $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+ $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+ $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+...
+
+# The contiguous stores form a single group.
+
+---
+name: vmem_store_group
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GCN-LABEL: name: vmem_store_group
+ ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+ $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+...
+
+---
+name: smem_load_group
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GCN-LABEL: name: smem_load_group
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+ ; GCN-NEXT: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+ ; GCN-NEXT: $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+ ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+ $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+ $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+ $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+ $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+ $sgpr2 = S_MOV_B32 0
+...
+
+---
+name: smem_store_group
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GCN-LABEL: name: smem_store_group
+ ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr2, $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr3, $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr4, $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr3 = S_MOV_B32 0
+ S_STORE_DWORD_IMM $sgpr2, $sgpr0_sgpr1, 0, 0
+ S_STORE_DWORD_IMM $sgpr3, $sgpr0_sgpr1, 0, 0
+ S_STORE_DWORD_IMM $sgpr4, $sgpr0_sgpr1, 0, 0
+ S_STORE_DWORD_IMM $sgpr5, $sgpr0_sgpr1, 0, 0
+ $sgpr2 = S_MOV_B32 0
+ $sgpr3 = S_MOV_B32 0
+...
+
+# The four global_load instructions form two separate groups due to the intervening s_nop.
+
+---
+name: vmem_loads_with_an_intervening_nop
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr10
+ ; GCN-LABEL: name: vmem_loads_with_an_intervening_nop
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+ ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 2
+ ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+ S_NOP 0
+ $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+ $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+ $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+...
+
+---
+name: vmem_contiguous_loads_with_an_intervening_store
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr10
+ ; GCN-LABEL: name: vmem_contiguous_loads_with_an_intervening_store
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 1
+ ; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr2_vgpr3, 32, 0, implicit $exec
+ ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+ ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 2
+ ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+ GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr2_vgpr3, 32, 0, implicit $exec
+ $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+ $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+ $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+...
+
+---
+name: vmem_stores_with_intervening_nop
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GCN-LABEL: name: vmem_stores_with_intervening_nop
+ ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ S_NOP 0
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+ $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+...
+
+# The intervening load breaks the store group and forms two distinct store groups.
+
+---
+name: vmem_contiguous_stores_with_an_intervening_load
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GCN-LABEL: name: vmem_contiguous_stores_with_an_intervening_load
+ ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr11 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ $vgpr11 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+ $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+...
+
+# Atomic operations should not form a group. But they are memory instructions and should increment
+# the xcnt counter value as they might cause register dependency. This test ensures S_WAIT_XCNT
+# insertion for such cases.
+
+---
+name: atomic_op
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-LABEL: name: atomic_op
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1)
+ ; GCN-NEXT: GLOBAL_ATOMIC_ADD_F32 $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, implicit $exec :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1)
+ ; GCN-NEXT: $vgpr6 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: S_WAIT_XCNT 2
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 1
+ ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr2 = BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1)
+ GLOBAL_ATOMIC_ADD_F32 $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, implicit $exec :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1)
+ $vgpr6 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+...
+
+# Force insert S_WAIT_XCNT 0 for dependency in SMEM instruction even though
+# there is a pending VMEM dependency.
+
+---
+name: smem_xcnt_insertion_with_pending_vmem_event
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GCN-LABEL: name: smem_xcnt_insertion_with_pending_vmem_event
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 2, implicit $exec
+ ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 4, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec
+ $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+ $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 2, implicit $exec
+ $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 4, implicit $exec
+ GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec
+ $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc
+ $vgpr0 = V_MOV_B32_e32 20, implicit $exec
+...
+
+# The second instruction in the flat_load group has a WAR dependency with a prior
+# memory operation (scratch_load instruction).
+
+---
+name: vmem_group_reg_dependency_with_prior_instruction
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr4, $vgpr5
+ ; GCN-LABEL: name: vmem_group_reg_dependency_with_prior_instruction
+ ; GCN: liveins: $vgpr4, $vgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+...
+
+# Two instructions inside the load group have dependencies with prior instructions.
+
+---
+name: multiple_xcnt_insertion_in_group
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr3, $vgpr4, $vgpr5
+ ; GCN-LABEL: name: multiple_xcnt_insertion_in_group
+ ; GCN: liveins: $vgpr3, $vgpr4, $vgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr3, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ ; GCN-NEXT: $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ ; GCN-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ $vgpr2 = SCRATCH_LOAD_DWORD $vgpr3, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+ $vgpr8 = V_MOV_B32_e32 1, implicit $exec
+...
+
+---
+name: xcnt_event_post_load_group
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr10
+ ; GCN-LABEL: name: xcnt_event_post_load_group
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+ ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+ ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 3
+ ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 2
+ ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 1
+ ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+ $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+ $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+ GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+...
+
+# The three V_MOV_B32 instructions waiting outside the group need appropriate wait_xcnt
+# insertion as their dst registers have dependencies with instructions inside the group.
+
+---
+name: xcnt_event_post_store_group
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GCN-LABEL: name: xcnt_event_post_store_group
+ ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 8
+ ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 6
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 4
+ ; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 2, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr11 = V_LSHLREV_B32_e64 16, $vgpr10, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+ $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+ GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr7 = V_MOV_B32_e32 2, implicit $exec
+ $vgpr11 = V_LSHLREV_B32_e64 16, $vgpr10, implicit $exec
+...
+
+# This test captures the case where interleaved load and store operations form separate groups.
+# The registers in the V_MOV_B32 instructions all have dependencies on these independent groups
+# and should get wait_xcnt insertions with appropriate wait values.
+
+---
+name: load_store_switching
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GCN-LABEL: name: load_store_switching
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr5, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr7 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, $vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr8 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr2_vgpr3, $vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 1
+ ; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 2
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr8 = V_MOV_B32_e32 2, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 3, implicit $exec
+ $vgpr0 = SCRATCH_LOAD_DWORD $vgpr5, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+ $vgpr7 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr2_vgpr3, $vgpr4, 0, 0, implicit $exec
+ $vgpr8 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+ GLOBAL_STORE_DWORD killed $vgpr2_vgpr3, $vgpr5, 0, 0, implicit $exec
+ $vgpr7 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr8 = V_MOV_B32_e32 2, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 3, implicit $exec
+...
+
+# V_DUAL_MOV is a single instruction and should emit the required xcnt wait
+# if the destination registers have any memory-op dependency.
+
+---
+name: dual_mov
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $vgpr1
+ ; GCN-LABEL: name: dual_mov
+ ; GCN: liveins: $sgpr0, $sgpr1, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr2 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+...
+
+# No xcnt wait insertion for DS load/store operations.
+
+---
+name: ds_load_store
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: ds_load_store
+ ; GCN: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: $vgpr0 = DS_READ_B32_gfx9 killed $vgpr1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) undef`, addrspace 3)
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 2, implicit $exec
+ ; GCN-NEXT: S_WAIT_DSCNT 0
+ ; GCN-NEXT: DS_WRITE_B32_gfx9 killed $vgpr0, killed $vgpr1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) undef`, addrspace 3)
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr0 = DS_READ_B32_gfx9 killed $vgpr1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(3)* undef`)
+ $vgpr1 = V_MOV_B32_e32 2, implicit $exec
+ DS_WRITE_B32_gfx9 killed $vgpr0, killed $vgpr1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(3)* undef`)
+ $vgpr0 = V_MOV_B32_e32 20, implicit $exec
+...
+
+---
+name: xcnt_max
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
+ ; GCN-LABEL: name: xcnt_max
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 62
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+...
More information about the llvm-commits
mailing list