[llvm] f5199d7 - [AMDGPU] Revise handling of preexisting waitcnt
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Wed May 5 17:21:46 PDT 2021
Author: Austin Kerbow
Date: 2021-05-05T17:21:33-07:00
New Revision: f5199d7ae0edf47c41d7347aad18f893ad20f0b3
URL: https://github.com/llvm/llvm-project/commit/f5199d7ae0edf47c41d7347aad18f893ad20f0b3
DIFF: https://github.com/llvm/llvm-project/commit/f5199d7ae0edf47c41d7347aad18f893ad20f0b3.diff
LOG: [AMDGPU] Revise handling of preexisting waitcnt
Preexisting waitcnt may not update the scoreboard if the instruction
being examined needed to wait on fewer counters than what was encoded in
the old waitcnt instruction. Fixing this results in the elimination of
some redundant waitcnt.
These changes also enable combining consecutive waitcnt into a single
S_WAITCNT or S_WAITCNT_VSCNT instruction.
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D100281
Added:
llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
Modified:
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
llvm/test/CodeGen/AMDGPU/flat-scratch.ll
llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
llvm/test/CodeGen/AMDGPU/waitcnt-debug.mir
llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ffc651f3b86eb..204e392517836 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -245,8 +245,8 @@ class WaitcntBrackets {
const SIRegisterInfo *TRI, unsigned OpNo) const;
bool counterOutOfOrder(InstCounterType T) const;
- bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
- bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
void determineWait(InstCounterType T, unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const;
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
@@ -418,7 +418,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
}
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
- DebugCounter::shouldExecute(ForceLgkmCounter)) {
+ DebugCounter::shouldExecute(ForceLgkmCounter)) {
ForceEmitWaitcnt[LGKM_CNT] = true;
} else {
ForceEmitWaitcnt[LGKM_CNT] = false;
@@ -442,6 +442,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
WaitcntBrackets *ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
+ bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
};
} // end anonymous namespace
@@ -708,22 +711,23 @@ void WaitcntBrackets::print(raw_ostream &OS) {
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
-bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
- return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
- simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
- simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
- simplifyWaitcnt(VS_CNT, Wait.VsCnt);
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+ simplifyWaitcnt(VM_CNT, Wait.VmCnt);
+ simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
+ simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+ simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}
-bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
+void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
unsigned &Count) const {
const unsigned LB = getScoreLB(T);
const unsigned UB = getScoreUB(T);
- if (Count < UB && UB - Count > LB)
- return true;
- Count = ~0u;
- return false;
+ // The number of outstanding events for this type, T, can be calculated
+ // as (UB - LB). If the current Count is greater than or equal to the number
+ // of outstanding events, then the wait for this counter is redundant.
+ if (Count >= UB - LB)
+ Count = ~0u;
}
void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
@@ -798,6 +802,107 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() {
return new SIInsertWaitcnts();
}
+/// Combine consecutive waitcnt instructions that precede \p MI and follow
+/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
+/// by previous passes. Currently this pass conservatively assumes that these
+/// preexisting waitcnt are required for correctness.
+bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait,
+ const MachineInstr *MI) {
+ bool Modified = false;
+ MachineInstr *WaitcntInstr = nullptr;
+ MachineInstr *WaitcntVsCntInstr = nullptr;
+ for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
+ &*II != MI; II = NextI, ++NextI) {
+ if (II->isMetaInstruction())
+ continue;
+
+ if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ // Conservatively update required wait if this waitcnt was added in an
+ // earlier pass. In this case it will not exist in the tracked waitcnt
+ // set.
+ if (!TrackedWaitcntSet.count(&*II)) {
+ unsigned IEnc = II->getOperand(0).getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+ Wait = Wait.combined(OldWait);
+ }
+
+ // Merge consecutive waitcnt of the same type by erasing multiples.
+ if (!WaitcntInstr) {
+ WaitcntInstr = &*II;
+ } else {
+ II->eraseFromParent();
+ Modified = true;
+ }
+
+ } else {
+ assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ if (!TrackedWaitcntSet.count(&*II)) {
+ unsigned OldVSCnt =
+ TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
+ Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
+ }
+
+ if (!WaitcntVsCntInstr) {
+ WaitcntVsCntInstr = &*II;
+ } else {
+ II->eraseFromParent();
+ Modified = true;
+ }
+ }
+ }
+
+ // Updated encoding of merged waitcnt with the required wait.
+ if (WaitcntInstr) {
+ if (Wait.hasWaitExceptVsCnt()) {
+ unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
+ unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
+ if (OldEnc != NewEnc) {
+ WaitcntInstr->getOperand(0).setImm(NewEnc);
+ Modified = true;
+ }
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VmCnt = ~0u;
+ Wait.LgkmCnt = ~0u;
+ Wait.ExpCnt = ~0u;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+ << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
+ << '\n');
+ } else {
+ WaitcntInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ if (WaitcntVsCntInstr) {
+ if (Wait.hasWaitVsCnt()) {
+ assert(ST->hasVscnt());
+ unsigned OldVSCnt =
+ TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
+ ->getImm();
+ if (Wait.VsCnt != OldVSCnt) {
+ TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
+ ->setImm(Wait.VsCnt);
+ Modified = true;
+ }
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VsCnt = ~0u;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+ << "Old Instr: " << MI
+ << "New Instr: " << *WaitcntVsCntInstr << '\n');
+ } else {
+ WaitcntVsCntInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ return Modified;
+}
+
static bool readsVCCZ(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
@@ -833,12 +938,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr) {
setForceEmitWaitcnt();
- bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
if (MI.isMetaInstruction())
return false;
AMDGPU::Waitcnt Wait;
+ bool Modified = false;
// See if this instruction has a forced S_WAITCNT VM.
// TODO: Handle other cases of NeedsWaitcntVmBefore()
@@ -1053,32 +1158,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
}
- // Early-out if no wait is indicated.
- if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
- bool Modified = false;
- if (OldWaitcntInstr) {
- for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
- &*II != &MI; II = NextI, ++NextI) {
- if (II->isDebugInstr())
- continue;
-
- if (TrackedWaitcntSet.count(&*II)) {
- TrackedWaitcntSet.erase(&*II);
- II->eraseFromParent();
- Modified = true;
- } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
- int64_t Imm = II->getOperand(0).getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
- } else {
- assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
- assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
- }
- }
- }
- return Modified;
- }
+ // Verify that the wait is actually needed.
+ ScoreBrackets.simplifyWaitcnt(Wait);
if (ForceEmitZeroWaitcnts)
Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
@@ -1092,57 +1173,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (ForceEmitWaitcnt[VS_CNT])
Wait.VsCnt = 0;
- ScoreBrackets.applyWaitcnt(Wait);
-
- AMDGPU::Waitcnt OldWait;
- bool Modified = false;
-
if (OldWaitcntInstr) {
- for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
- &*II != &MI; II = NextI, NextI++) {
- if (II->isDebugInstr())
- continue;
-
- if (II->getOpcode() == AMDGPU::S_WAITCNT) {
- unsigned IEnc = II->getOperand(0).getImm();
- AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
- OldWait = OldWait.combined(IWait);
- if (!TrackedWaitcntSet.count(&*II))
- Wait = Wait.combined(IWait);
- unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
- if (IEnc != NewEnc) {
- II->getOperand(0).setImm(NewEnc);
- Modified = true;
- }
- Wait.VmCnt = ~0u;
- Wait.LgkmCnt = ~0u;
- Wait.ExpCnt = ~0u;
- } else {
- assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
- assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
-
- unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
- ->getImm();
- OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
- if (!TrackedWaitcntSet.count(&*II))
- Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
- if (Wait.VsCnt != ICnt) {
- TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
- Modified = true;
- }
- Wait.VsCnt = ~0u;
- }
-
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI
- << "New Instr: " << *II << '\n');
-
- if (!Wait.hasWait())
- return Modified;
- }
+ // Try to merge the required wait with preexisting waitcnt instructions.
+ // Also erase redundant waitcnt.
+ Modified =
+ applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
+ } else {
+ // Update waitcnt brackets after determining the required wait.
+ ScoreBrackets.applyWaitcnt(Wait);
}
- if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
+ // Build new waitcnt instructions unless no wait is needed or the old waitcnt
+ // instruction was modified to handle the required wait.
+ if (Wait.hasWaitExceptVsCnt()) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
@@ -1155,7 +1198,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
<< "New Instr: " << *SWaitInst << '\n');
}
- if (Wait.VsCnt != ~0u) {
+ if (Wait.hasWaitVsCnt()) {
assert(ST->hasVscnt());
auto SWaitInst =
@@ -1430,7 +1473,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
Iter != E;) {
MachineInstr &Inst = *Iter;
- // Track pre-existing waitcnts from earlier iterations.
+ // Track pre-existing waitcnts that were added in earlier iterations or by
+ // the memory legalizer.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
Inst.getOperand(0).isReg() &&
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 670375a9db9fb..a60c0847cb879 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -477,6 +477,14 @@ struct Waitcnt {
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
}
+ bool hasWaitExceptVsCnt() const {
+ return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
+ }
+
+ bool hasWaitVsCnt() const {
+ return VsCnt != ~0u;
+ }
+
bool dominates(const Waitcnt &Other) const {
return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 794fa1935e58e..230a97a917dfa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -184,7 +184,6 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
; GFX9-NEXT: scratch_store_dword off, v0, s1
@@ -357,7 +356,6 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_u32 s1, 0x4004, s1
; GFX9-NEXT: scratch_store_dword off, v0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
index 774270ea087fa..4b5c807a937c4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
@@ -72,7 +72,6 @@ define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind {
; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index aa659e866fd70..4e7c299639c7d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -788,7 +788,6 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
; GFX9-NEXT: scratch_store_dword off, v0, s1
@@ -1419,7 +1418,6 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_u32 s1, 0x4004, s1
; GFX9-NEXT: scratch_store_dword off, v0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index d5348038f3c3d..055a782ec055d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -630,7 +630,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
@@ -706,7 +705,6 @@ define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
@@ -731,7 +729,6 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(double* %ptr) #1 {
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
index d2727faa1f603..b3d52793fec04 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -192,7 +192,6 @@ define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)*
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; NOLOOP-NEXT: load_dword
define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
@@ -220,7 +219,6 @@ define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
; NOLOOP: s_mov_b32 m0, 0
; NOLOOP: ds_gws_init v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; NOLOOP-NEXT: ds_gws_barrier v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 5a1114c7e7e3d..f3fedab1ff544 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -188,7 +188,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
@@ -204,7 +203,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -220,7 +218,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -248,7 +245,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -285,7 +281,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
@@ -303,7 +298,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -321,7 +315,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -351,7 +344,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -1121,7 +1113,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -1138,7 +1129,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1155,7 +1145,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1182,7 +1171,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -1218,7 +1206,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -1237,7 +1224,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1256,7 +1242,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1285,7 +1270,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -1322,7 +1306,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -1341,7 +1324,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1360,7 +1342,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1389,7 +1370,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -2456,7 +2436,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -2478,7 +2457,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2500,7 +2478,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2532,7 +2509,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -2575,7 +2551,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -2599,7 +2574,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2623,7 +2597,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2657,7 +2630,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -2701,7 +2673,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -2725,7 +2696,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2749,7 +2719,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2783,7 +2752,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -2826,7 +2794,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -2848,7 +2815,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2870,7 +2836,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2902,7 +2867,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -2945,7 +2909,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -2969,7 +2932,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2993,7 +2955,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3027,7 +2988,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -3071,7 +3031,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -3095,7 +3054,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -3119,7 +3077,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3153,7 +3110,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -3197,7 +3153,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -3221,7 +3176,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -3245,7 +3199,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3279,7 +3232,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -3323,7 +3275,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -3347,7 +3298,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -3371,7 +3321,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3405,7 +3354,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 418e2819ea6bd..1abe2dc3b5c6c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -188,7 +188,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
@@ -204,7 +203,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -220,7 +218,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -248,7 +245,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -285,7 +281,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
@@ -303,7 +298,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -321,7 +315,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -351,7 +344,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -1121,7 +1113,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -1138,7 +1129,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1155,7 +1145,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1182,7 +1171,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -1218,7 +1206,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -1237,7 +1224,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1256,7 +1242,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1285,7 +1270,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -1322,7 +1306,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -1341,7 +1324,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1360,7 +1342,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1389,7 +1370,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -2456,7 +2436,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -2478,7 +2457,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2500,7 +2478,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2532,7 +2509,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -2575,7 +2551,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -2599,7 +2574,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2623,7 +2597,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2657,7 +2630,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -2701,7 +2673,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -2725,7 +2696,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2749,7 +2719,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2783,7 +2752,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -2826,7 +2794,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -2848,7 +2815,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2870,7 +2836,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2902,7 +2867,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -2945,7 +2909,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -2969,7 +2932,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2993,7 +2955,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3027,7 +2988,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -3071,7 +3031,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -3095,7 +3054,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -3119,7 +3077,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3153,7 +3110,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -3197,7 +3153,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -3221,7 +3176,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -3245,7 +3199,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3279,7 +3232,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@@ -3323,7 +3275,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@@ -3347,7 +3298,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -3371,7 +3321,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -3405,7 +3354,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 36208056c23e5..0cd6ea793d88e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -298,7 +298,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index f29b5abaf0998..f049cbc40934c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -202,7 +202,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -297,7 +296,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1103,7 +1101,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1194,7 +1191,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1289,7 +1285,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2363,7 +2358,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2479,7 +2473,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2599,7 +2592,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2716,7 +2708,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2832,7 +2823,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2952,7 +2942,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -3072,7 +3061,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -3192,7 +3180,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-debug.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-debug.mir
index d8aa86a0582ef..641a31ab9b0f0 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-debug.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-debug.mir
@@ -25,6 +25,7 @@
# VM-NEXT: S_NOP 0
# ZERO: S_WAITCNT 0
+# ZERO-NEXT: S_NOP 0
# ZERO-NEXT: S_WAITCNT 0
# ZERO-NEXT: S_NOP 0
# ZERO-NEXT: S_WAITCNT 0
@@ -32,6 +33,8 @@
name: waitcnt-debug
liveins:
+machineFunctionInfo:
+ isEntryFunction: true
body: |
bb.0:
S_NOP 0
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
new file mode 100644
index 0000000000000..5601d69317053
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
@@ -0,0 +1,131 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
+
+---
+name: test_waitcnt_preexisting_vscnt_unmodified
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_unmodified
+ ; GFX10: S_WAITCNT 0
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: S_BARRIER
+ ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX10: S_WAITCNT 112
+ ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX10: S_ENDPGM 0
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ S_WAITCNT_VSCNT undef $sgpr_null, 0
+ S_BARRIER
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: test_waitcnt_preexisting_vscnt_needs_vscnt
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_needs_vscnt
+ ; GFX10: S_WAITCNT 0
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: S_BARRIER
+ ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX10: S_WAITCNT 112
+ ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX10: S_ENDPGM 0
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ S_WAITCNT_VSCNT undef $sgpr_null, 1
+ S_BARRIER
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: test_waitcnt_preexisting_vscnt_with_other_waitcnt
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_with_other_waitcnt
+ ; GFX10: S_WAITCNT 0
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GFX10: S_WAITCNT 112
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: S_BARRIER
+ ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX10: S_WAITCNT 112
+ ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX10: S_ENDPGM 0
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ S_WAITCNT 112
+ S_WAITCNT_VSCNT undef $sgpr_null, 0
+ S_BARRIER
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: test_waitcnt_preexisting_vscnt_combined
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined
+ ; GFX10: S_WAITCNT 0
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: S_BARRIER
+ ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX10: S_WAITCNT 112
+ ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX10: S_ENDPGM 0
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ S_WAITCNT_VSCNT undef $sgpr_null, 0
+ S_WAITCNT_VSCNT undef $sgpr_null, 1
+ S_WAITCNT_VSCNT undef $sgpr_null, 2
+ S_BARRIER
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: test_waitcnt_preexisting_vscnt_combined_both_types
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined_both_types
+ ; GFX10: S_WAITCNT 0
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ ; GFX10: S_WAITCNT 0
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: S_BARRIER
+ ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX10: S_WAITCNT 112
+ ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX10: S_ENDPGM 0
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+ S_WAITCNT 0
+ S_WAITCNT_VSCNT undef $sgpr_null, 1
+ S_WAITCNT 0
+ S_WAITCNT_VSCNT undef $sgpr_null, 2
+ S_WAITCNT 0
+ S_BARRIER
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
index 5246232fabd1b..a79187a4e387a 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
@@ -1,37 +1,195 @@
-# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN %s
-
-# GCN-LABEL: name: test{{$}}
-# GCN: S_WAITCNT -16257
-# GCN: DS_READ2_B32
-# GCN: DS_READ2_B32
-# GCN: S_WAITCNT 383{{$}}
-# GCN-NEXT: $vgpr1 = V_OR_B32_e32 1, killed $vgpr1, implicit $exec
-# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
-# GCN-NEXT: S_WAITCNT 127{{$}}
-# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec
---- |
- define amdgpu_cs void @test() {
- ret void
- }
-...
----
-name: test
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr1, $vgpr0
-
- renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 480, 0
- renamable $vgpr13 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec
- S_WAITCNT -16257
- renamable $vgpr0_vgpr1 = DS_READ2_B32 renamable $vgpr13, 0, 1, 0, implicit $m0, implicit $exec
- renamable $vgpr2_vgpr3 = DS_READ2_B32 renamable $vgpr13, 2, 3, 0, implicit $m0, implicit $exec
- renamable $vgpr1 = V_OR_B32_e32 1, killed $vgpr1, implicit $exec
- renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
- renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec
- renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
- $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
- $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
- $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
- IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9 %s
+
+---
+name: test_waitcnt_preexisting_lgkmcnt_unmodified
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GFX9-LABEL: name: test_waitcnt_preexisting_lgkmcnt_unmodified
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
+ ; GFX9: S_WAITCNT 49279
+ ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_WAITCNT 112
+ ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_ENDPGM 0
+ $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
+ S_WAITCNT 49279
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: test_waitcnt_preexisting_vmcnt_unmodified
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; GFX9-LABEL: name: test_waitcnt_preexisting_vmcnt_unmodified
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX9: S_WAITCNT 3952
+ ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_WAITCNT 112
+ ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_ENDPGM 0
+ $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+# Respect preexisting waitcnt and add required wait.
+
+---
+name: test_waitcnt_preexisting_vmcnt_needs_lgkmcnt
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GFX9-LABEL: name: test_waitcnt_preexisting_vmcnt_needs_lgkmcnt
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
+ ; GFX9: S_WAITCNT 112
+ ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_WAITCNT 112
+ ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_ENDPGM 0
+ $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
+ S_WAITCNT 3952
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: test_waitcnt_preexisting_lgkmcnt_needs_vmcnt
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; GFX9-LABEL: name: test_waitcnt_preexisting_lgkmcnt_needs_vmcnt
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX9: S_WAITCNT 112
+ ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_WAITCNT 112
+ ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_ENDPGM 0
+ $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 49279
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+# Apply wait for all counters from preexisting waitcnt regardless of the wait
+# required by the next instruction.
+
+---
+name: test_waitcnt_preexisting_apply_all_counters
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_waitcnt_preexisting_apply_all_counters
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX9: $vgpr6_vgpr7 = DS_READ2_B32 $vgpr2, 0, 1, 0, implicit $m0, implicit $exec
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr6 = V_OR_B32_e32 1, killed $vgpr6, implicit $exec
+ ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_WAITCNT 112
+ ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr6_vgpr7 = DS_READ2_B32 $vgpr2, 0, 1, 0, implicit $m0, implicit $exec
+ S_WAITCNT 0
+ $vgpr6 = V_OR_B32_e32 1, killed $vgpr6, implicit $exec
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: test_waitcnt_preexisting_combine_waitcnt
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; GFX9-LABEL: name: test_waitcnt_preexisting_combine_waitcnt
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ S_WAITCNT 0
+ S_WAITCNT 0
+ S_WAITCNT 0
+ S_WAITCNT 0
+ S_WAITCNT 0
+ S_WAITCNT 0
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: test_waitcnt_preexisting_combine_waitcnt_diff_counters
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; GFX9-LABEL: name: test_waitcnt_preexisting_combine_waitcnt_diff_counters
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_WAITCNT 112
+ ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ S_WAITCNT 49279
+ S_WAITCNT 3952
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+...
+
+# Apply preexisting waitcnt when no wait is immediately needed.
+# FIXME: Move waitcnt as late as possible.
+
+---
+name: test_waitcnt_preexisting_early_wait
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; GFX9-LABEL: name: test_waitcnt_preexisting_early_wait
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: S_NOP 0
+ ; GFX9: S_NOP 0
+ ; GFX9: S_NOP 0
+ ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_ENDPGM 0
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ S_WAITCNT 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
+
+---
+name: test_waitcnt_preexisting_ignore_kill
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; GFX9-LABEL: name: test_waitcnt_preexisting_ignore_kill
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9: S_WAITCNT 3952
+ ; GFX9: KILL $vgpr0
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ S_WAITCNT 3952
+ KILL $vgpr0
+...
More information about the llvm-commits
mailing list