[llvm] [AMDGPU][SIInsertWaitcnts] Track SCC. Insert KM_CNT waits for SCC writes. (PR #157843)
Petar Avramovic via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 05:25:24 PDT 2025
https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/157843
Add a new event, SCC_WRITE, for s_barrier_signal_isfirst and s_barrier_leave,
instructions that write to SCC; the associated counter is KM_CNT.
Also start tracking reads and writes of SCC.
An s_barrier_wait on the same barrier guarantees that the SCC write from
s_barrier_signal_isfirst has landed, so there is no need to insert s_wait_kmcnt.
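As a rough illustration (a sketch based on the updated
llvm.amdgcn.s.barrier.signal.isfirst tests below, with the surrounding
boilerplate waits omitted), the pass now emits:

    s_barrier_signal_isfirst -1
    s_wait_kmcnt 0x0             ; SCC result may still be in flight
    s_cselect_b32 s0, -1, 0

but when a wait on the same barrier follows, the extra s_wait_kmcnt is dropped:

    s_barrier_signal_isfirst -1
    s_barrier_wait -1            ; same barrier, so the SCC write has landed
    s_cselect_b32 s0, -1, 0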
>From 5f13830a64db61c8dba1f81d060e7ee2894b20ee Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Wed, 10 Sep 2025 14:23:37 +0200
Subject: [PATCH] [AMDGPU][SIInsertWaitcnts] Track SCC. Insert KM_CNT waits for
SCC writes.
Add a new event, SCC_WRITE, for s_barrier_signal_isfirst and s_barrier_leave,
instructions that write to SCC; the associated counter is KM_CNT.
Also start tracking reads and writes of SCC.
An s_barrier_wait on the same barrier guarantees that the SCC write from
s_barrier_signal_isfirst has landed, so there is no need to insert s_wait_kmcnt.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 80 +++++++-
.../llvm.amdgcn.s.barrier.signal.isfirst.ll | 75 +++++++-
.../llvm.amdgcn.s.barrier.signal.isfirst.mir | 105 +++++++++++
llvm/test/CodeGen/AMDGPU/s-barrier.ll | 1 +
.../waitcnt-kmcnt-scc-different-block.mir | 173 ++++++++++++++++++
5 files changed, 427 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b163a274396ff..f2c12c32860a6 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -121,6 +121,7 @@ struct HardwareLimits {
DECL(LDS_ACCESS) /* lds read & write */ \
DECL(GDS_ACCESS) /* gds read & write */ \
DECL(SQ_MESSAGE) /* send message */ \
+ DECL(SCC_WRITE) /* write to SCC from barrier */ \
DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
DECL(SMEM_GROUP) /* scalar-memory group */ \
DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
@@ -149,6 +150,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
+// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
@@ -163,6 +165,9 @@ enum RegisterMapping {
FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
NUM_LDS_VGPRS = 9, // One more than the stores we track.
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
+ NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
+ // Remaining non-allocatable registers
+ SCC = NUM_ALL_ALLOCATABLE
};
// Enumerate different types of result-returning VMEM operations. Although
@@ -401,7 +406,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
eventMask({VMEM_SAMPLER_READ_ACCESS}),
eventMask({VMEM_BVH_READ_ACCESS}),
- eventMask({SMEM_ACCESS, SQ_MESSAGE}),
+ eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
eventMask({VMEM_GROUP, SMEM_GROUP})};
return WaitEventMaskForInstGFX12Plus;
@@ -586,6 +591,7 @@ class SIInsertWaitcnts {
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
+ bool asynchronouslyWritesSCC(unsigned Opcode) const;
};
// This objects maintains the current score brackets of each wait counter, and
@@ -626,7 +632,12 @@ class WaitcntBrackets {
unsigned getRegScore(int GprNo, InstCounterType T) const {
if (GprNo < NUM_ALL_VGPRS)
return VgprScores[T][GprNo];
- return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+
+ if (GprNo < NUM_ALL_ALLOCATABLE)
+ return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+
+ assert(GprNo == SCC);
+ return SCCScore;
}
bool merge(const WaitcntBrackets &Other);
@@ -646,6 +657,7 @@ class WaitcntBrackets {
AMDGPU::Waitcnt &Wait) const {
determineWait(T, {RegNo, RegNo + 1}, Wait);
}
+ void tryClearSCCWriteEvent(MachineInstr *Inst);
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
@@ -785,6 +797,10 @@ class WaitcntBrackets {
// Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
// X_CNT score.
unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
+ // Reg score for SCC.
+ unsigned SCCScore = 0;
+ // The unique instruction that has an SCC write pending, if there is one.
+ const MachineInstr *PendingSCCWrite = nullptr;
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
// write to each vgpr.
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
@@ -820,6 +836,9 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI,
const MachineOperand &Op) const {
+ if (Op.getReg() == AMDGPU::SCC)
+ return {SCC, SCC + 1};
+
if (!TRI->isInAllocatableClass(Op.getReg()))
return {-1, -1};
@@ -864,9 +883,11 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
if (RegNo < NUM_ALL_VGPRS) {
VgprUB = std::max(VgprUB, RegNo);
VgprScores[CntTy][RegNo] = Score;
- } else {
+ } else if (RegNo < NUM_ALL_ALLOCATABLE) {
SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
+ } else { // SCC
+ SCCScore = Score;
}
}
}
@@ -1077,6 +1098,11 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
if (Slot)
setRegScore(FIRST_LDS_VGPR, T, CurrScore);
}
+
+ if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
+ setRegScore(SCC, T, CurrScore);
+ PendingSCCWrite = &Inst;
+ }
}
}
@@ -1145,6 +1171,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
OS << RelScore << ":s" << J << " ";
}
}
+ if (T == KM_CNT && SCCScore > 0)
+ OS << SCCScore << ":SCC ";
}
OS << '\n';
}
@@ -1219,6 +1247,24 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
}
}
+void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
+ // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
+ // SCC has landed
+ if (PendingSCCWrite &&
+ PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
+ PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
+ unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
+ // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
+ if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
+ SCC_WRITE_PendingEvent) {
+ setScoreLB(KM_CNT, getScoreUB(KM_CNT));
+ }
+
+ PendingEvents &= ~SCC_WRITE_PendingEvent;
+ PendingSCCWrite = nullptr;
+ }
+}
+
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
@@ -1908,6 +1954,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait);
}
}
+ } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
+ ScoreBrackets.tryClearSCCWriteEvent(&MI);
} else {
// FIXME: Should not be relying on memoperands.
// Look at the source operands of every instruction to see if
@@ -2003,6 +2051,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
}
ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
+ } else if (Op.getReg() == AMDGPU::SCC) {
+ ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
} else {
ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
}
@@ -2340,6 +2390,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
else
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
+ } else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
} else {
switch (Inst.getOpcode()) {
case AMDGPU::S_SENDMSG:
@@ -2350,9 +2402,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
break;
case AMDGPU::S_MEMTIME:
case AMDGPU::S_MEMREALTIME:
- case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
- case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
- case AMDGPU::S_BARRIER_LEAVE:
case AMDGPU::S_GET_BARRIER_STATE_M0:
case AMDGPU::S_GET_BARRIER_STATE_IMM:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
@@ -2419,6 +2468,19 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
if (T == DS_CNT)
StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
+ if (T == KM_CNT) {
+ StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
+ if (Other.hasPendingEvent(SCC_WRITE)) {
+ unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
+ if (!OldEventsHasSCCWrite) {
+ PendingSCCWrite = Other.PendingSCCWrite;
+ } else {
+ if (PendingSCCWrite != Other.PendingSCCWrite)
+ PendingSCCWrite = nullptr;
+ }
+ }
+ }
+
for (int J = 0; J <= VgprUB; J++)
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
@@ -2450,6 +2512,12 @@ static bool isWaitInstr(MachineInstr &Inst) {
counterTypeForInstr(Opcode).has_value();
}
+bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) const {
+ return Opcode == AMDGPU::S_BARRIER_LEAVE ||
+ Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
+ Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
+}
+
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineBasicBlock &Block,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
index 248e0c716b975..7ff13908eda4f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
@@ -12,10 +12,10 @@ define i1 @func1() {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: func1:
@@ -27,13 +27,86 @@ define i1 @func1() {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ ret i1 %r
+}
+
+define i1 @signal_isfirst_same_barrier_wait() {
+; GFX12-SDAG-LABEL: signal_isfirst_same_barrier_wait:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT: s_barrier_wait -1
+; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: signal_isfirst_same_barrier_wait:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_barrier_wait -1
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ ret i1 %r
+}
+
+define i1 @signal_isfirst_different_barrier_wait() {
+; GFX12-SDAG-LABEL: signal_isfirst_different_barrier_wait:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT: s_barrier_wait 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: signal_isfirst_different_barrier_wait:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_barrier_wait 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 0)
ret i1 %r
}
+declare void @llvm.amdgcn.s.barrier.wait(i16)
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir
new file mode 100644
index 0000000000000..3972553867ab9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir
@@ -0,0 +1,105 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: signal_isfirst_imm_same_barrier_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: signal_isfirst_imm_same_barrier_wait
+ ; GCN: S_WAIT_LOADCNT_DSCNT 0
+ ; GCN-NEXT: S_WAIT_EXPCNT 0
+ ; GCN-NEXT: S_WAIT_SAMPLECNT 0
+ ; GCN-NEXT: S_WAIT_BVHCNT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
+ ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ ; GCN-NEXT: S_BARRIER_WAIT -1
+ ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+ S_CMP_EQ_U32 0, 0, implicit-def $scc
+ S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ S_BARRIER_WAIT -1
+ renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+name: signal_isfirst_imm_different_barrier_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: signal_isfirst_imm_different_barrier_wait
+ ; GCN: S_WAIT_LOADCNT_DSCNT 0
+ ; GCN-NEXT: S_WAIT_EXPCNT 0
+ ; GCN-NEXT: S_WAIT_SAMPLECNT 0
+ ; GCN-NEXT: S_WAIT_BVHCNT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
+ ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ ; GCN-NEXT: S_BARRIER_WAIT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+ S_CMP_EQ_U32 0, 0, implicit-def $scc
+ S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ S_BARRIER_WAIT 0
+ renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+name: signal_isfirst_m0_same_barrier_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: signal_isfirst_m0_same_barrier_wait
+ ; GCN: S_WAIT_LOADCNT_DSCNT 0
+ ; GCN-NEXT: S_WAIT_EXPCNT 0
+ ; GCN-NEXT: S_WAIT_SAMPLECNT 0
+ ; GCN-NEXT: S_WAIT_BVHCNT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
+ ; GCN-NEXT: $m0 = S_MOV_B32 -1
+ ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
+ ; GCN-NEXT: S_BARRIER_WAIT -1
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+ S_CMP_EQ_U32 0, 0, implicit-def $scc
+ $m0 = S_MOV_B32 -1
+ S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
+ S_BARRIER_WAIT -1
+ renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+name: signal_isfirst_m0_different_barrier_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: signal_isfirst_m0_different_barrier_wait
+ ; GCN: S_WAIT_LOADCNT_DSCNT 0
+ ; GCN-NEXT: S_WAIT_EXPCNT 0
+ ; GCN-NEXT: S_WAIT_SAMPLECNT 0
+ ; GCN-NEXT: S_WAIT_BVHCNT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
+ ; GCN-NEXT: $m0 = S_MOV_B32 -1
+ ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
+ ; GCN-NEXT: S_BARRIER_WAIT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+ S_CMP_EQ_U32 0, 0, implicit-def $scc
+ $m0 = S_MOV_B32 -1
+ S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
+ S_BARRIER_WAIT 0
+ renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
index a4fa8e4b3c8e2..8a9beb73a6baa 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -155,6 +155,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_barrier_signal -1
; GFX12-GISEL-NEXT: s_barrier_join m0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_barrier_leave
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir
new file mode 100644
index 0000000000000..33085bfa2cc96
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir
@@ -0,0 +1,173 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s
+
+---
+name: scc_write_in_other_block
+body: |
+ ; GFX12-LABEL: name: scc_write_in_other_block
+ ; GFX12: bb.0:
+ ; GFX12-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
+ ; GFX12-NEXT: S_WAIT_EXPCNT 0
+ ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
+ ; GFX12-NEXT: S_WAIT_BVHCNT 0
+ ; GFX12-NEXT: S_WAIT_KMCNT 0
+ ; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ ; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.1:
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
+ ; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: S_WAIT_KMCNT 0
+ ; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
+ ; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ bb.0:
+ S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+ bb.1:
+ renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
+ S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
+
+ bb.2:
+ renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
+ $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: scc_write_in_other_block_with_barrier_wait
+body: |
+ ; GFX12-LABEL: name: scc_write_in_other_block_with_barrier_wait
+ ; GFX12: bb.0:
+ ; GFX12-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
+ ; GFX12-NEXT: S_WAIT_EXPCNT 0
+ ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
+ ; GFX12-NEXT: S_WAIT_BVHCNT 0
+ ; GFX12-NEXT: S_WAIT_KMCNT 0
+ ; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ ; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.1:
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
+ ; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: S_BARRIER_WAIT -1
+ ; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
+ ; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ bb.0:
+ S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+ bb.1:
+ renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc
+ S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec
+
+ bb.2:
+ S_BARRIER_WAIT -1
+ renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
+ $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: scc_write_in_multiple_blocks_with_barrier_wait
+body: |
+ ; GFX12-LABEL: name: scc_write_in_multiple_blocks_with_barrier_wait
+ ; GFX12: bb.0:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
+ ; GFX12-NEXT: S_WAIT_EXPCNT 0
+ ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
+ ; GFX12-NEXT: S_WAIT_BVHCNT 0
+ ; GFX12-NEXT: S_WAIT_KMCNT 0
+ ; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ ; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.1:
+ ; GFX12-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr1, implicit-def $exec, implicit $exec
+ ; GFX12-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM 0, implicit-def $scc, implicit killed $scc
+ ; GFX12-NEXT: S_BRANCH %bb.5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM 1, implicit-def $scc, implicit killed $scc
+ ; GFX12-NEXT: S_BRANCH %bb.5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: S_BARRIER_WAIT -1
+ ; GFX12-NEXT: S_WAIT_KMCNT 0
+ ; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
+ ; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ bb.0:
+ S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ S_CBRANCH_EXECZ %bb.4, implicit $exec
+
+ bb.1:
+ V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr1, implicit-def $exec, implicit $exec
+ S_CBRANCH_EXECZ %bb.3, implicit $exec
+
+ bb.2:
+ S_BARRIER_SIGNAL_ISFIRST_IMM 0, implicit-def $scc, implicit killed $scc
+ S_BRANCH %bb.5
+
+ bb.3:
+ S_BARRIER_SIGNAL_ISFIRST_IMM 1, implicit-def $scc, implicit killed $scc
+ S_BRANCH %bb.5
+
+ bb.4:
+ S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+
+ bb.5:
+ S_BARRIER_WAIT -1
+ renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc
+ $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec
+ S_ENDPGM 0
+...