[llvm] [AMDGPU] Fix Xcnt handling between blocks (PR #165201)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 29 01:20:20 PDT 2025
https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/165201
From 8b1256954c52d52a71edc0127b242eacf8298697 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Tue, 28 Oct 2025 10:57:16 +0530
Subject: [PATCH 1/2] [AMDGPU] Fix Xcnt handling between blocks
The compiler needs to conservatively flush the
XCnt counter on entry to a block when both
SMEM and VMEM events are still pending.
---
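(Note for reviewers: the new check in generateWaitcntInstBefore amounts to
the sketch below. hasPendingEvent is the pass's existing query; only the
FlushXCnt plumbing is new in this patch.)

  // At the first instruction of a block, with both an SMEM and a VMEM
  // transfer group possibly still in flight, conservatively request a
  // full XCnt flush rather than folding it into another wait.
  if (FlushXCnt && ScoreBrackets.hasPendingEvent(SMEM_GROUP) &&
      ScoreBrackets.hasPendingEvent(VMEM_GROUP))
    Wait.XCnt = 0;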
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 42 +++++++++++++------
.../AMDGPU/flat-load-saddr-to-vaddr.ll | 5 +++
llvm/test/CodeGen/AMDGPU/mul.ll | 5 +++
llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 2 +-
4 files changed, 40 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6dcbced010a5a..11ff1bdb9e2a1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -565,12 +565,12 @@ class SIInsertWaitcnts {
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr,
- bool FlushVmCnt);
+ MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
+ bool FlushXCnt);
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
+ MachineInstr *OldWaitcntInstr, bool FlushXCnt);
void updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets);
bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -1841,12 +1841,13 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
-/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
-/// flush the vmcnt counter here.
+/// If FlushVmCnt/FlushXcnt is true, that means that we want to
+/// generate a s_waitcnt to flush the vmcnt/xcnt counter here.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
- bool FlushVmCnt) {
+ bool FlushVmCnt,
+ bool FlushXCnt) {
setForceEmitWaitcnt();
assert(!MI.isMetaInstruction());
@@ -2101,18 +2102,26 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.BvhCnt = 0;
}
+  // Conservatively flush the XCnt counter at the start of the block.
+  if (FlushXCnt && ScoreBrackets.hasPendingEvent(SMEM_GROUP) &&
+      ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+    Wait.XCnt = 0;
+
if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
Wait.LoadCnt = 0;
return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
- OldWaitcntInstr);
+ OldWaitcntInstr, FlushXCnt);
}
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
+ MachineInstr *OldWaitcntInstr,
+ bool FlushXCnt) {
bool Modified = false;
if (OldWaitcntInstr)
@@ -2141,7 +2150,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
}
// XCnt may be already consumed by a load wait.
- if (Wait.XCnt != ~0u) {
+ // If we need to flush the Xcnt counter, don't
+ // combine it with any other wait events.
+ if (Wait.XCnt != ~0u && !FlushXCnt) {
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
Wait.XCnt = ~0u;
@@ -2213,8 +2224,9 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
ScoreBrackets.simplifyWaitcnt(Wait);
auto SuccessorIt = std::next(Inst.getIterator());
- bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
- /*OldWaitcntInstr=*/nullptr);
+ bool Result =
+ generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
+ /*OldWaitcntInstr=*/nullptr, /*FlushXCnt=*/false);
if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
@@ -2454,6 +2466,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
+ bool FirstInstInBlock = true;
for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
E = Block.instr_end();
@@ -2475,10 +2488,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
isPreheaderToFlush(Block, ScoreBrackets);
+    bool FlushXCnt = FirstInstInBlock;
+    FirstInstInBlock = false;
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
- FlushVmCnt);
+ FlushVmCnt, FlushXCnt);
OldWaitcntInstr = nullptr;
// Restore vccz if it's not known to be correct already.
@@ -2567,7 +2583,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Combine or remove any redundant waitcnts at the end of the block.
Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
- OldWaitcntInstr);
+ OldWaitcntInstr, /*FlushXCnt=*/false);
LLVM_DEBUG({
dbgs() << "*** End Block: ";
diff --git a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
index e8efa859ce13f..564b2a8af338c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
@@ -10,6 +10,10 @@
; Check that we are changing SADDR form of a load to VADDR and do not have to use
; readfirstlane instructions to move address from VGPRs into SGPRs.
+; FIXME: Redundant xcnt in the loop header.
+; Pending xcnt events should check if they can be folded into soft waitcnts
+; before being propagated.
+
define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg1, ptr nocapture %arg2) {
; GCN-LABEL: test_move_load_address_to_vgpr:
; GCN: ; %bb.0: ; %bb
@@ -24,6 +28,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap
; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GCN-NEXT: .LBB0_1: ; %bb3
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index d29847e40dc8b..b9797cda5fa57 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2583,8 +2583,13 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX1250-NEXT: s_branch .LBB15_6
; GFX1250-NEXT: .LBB15_5:
; GFX1250-NEXT: v_mov_b32_e32 v0, s7
+; TODO: The xcnt can actually be merged into the loadcnt at the bottom.
+; If, instead of checking at the first instruction in the block, we
+; checked at the instruction that actually needs an xcnt whether it has
+; a pending smem and vmem, this extra xcnt could be avoided.
; GFX1250-NEXT: .LBB15_6: ; %endif
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
; GFX1250-NEXT: s_mov_b32 s2, -1
; GFX1250-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index 1b8e126f19ae1..2a80de849aec7 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -945,7 +945,6 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
---
name: wait_kmcnt_with_outstanding_vmem_2
tracksRegLiveness: true
@@ -970,6 +969,7 @@ body: |
; GCN-NEXT: liveins: $sgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
bb.0:
From 7aac631cf9a8ac6ed5ec834290e5adda4a5110f8 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Wed, 29 Oct 2025 13:36:09 +0530
Subject: [PATCH 2/2] Second attempt: preserve pending Xcnt events instead of
 flushing the counter. This seems much cleaner.
TODO: add more test cases.
---
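(For readers without the pass in front of them, here is a small standalone
model of the bookkeeping the new applyXcnt path preserves. It is not the
pass itself, and the names are illustrative: the point is that an XCnt wait
satisfied via KmCnt covers only the SMEM group, so a concurrently pending
VMEM group must stay pending.)

  #include <cassert>

  enum Event { SMEM_GROUP, VMEM_GROUP };

  struct XcntModel {
    unsigned PendingEvents = 0;
    unsigned ScoreLB = 0, ScoreUB = 0;

    bool hasPendingEvent(Event E) const { return PendingEvents & (1u << E); }

    // A wait-to-zero resolves everything up to the upper bound.
    void applyWaitcnt() {
      ScoreLB = ScoreUB;
      PendingEvents = 0;
    }

    // The fixed path: the XCnt wait is implied by KmCnt, so it covers
    // only the SMEM group; a pending VMEM group must be reactivated.
    void applyXcntViaKmcnt() {
      bool VmemWasPending = hasPendingEvent(VMEM_GROUP);
      unsigned LB = ScoreLB;
      applyWaitcnt();
      if (VmemWasPending) {
        PendingEvents |= 1u << VMEM_GROUP;
        ScoreLB = LB; // keep the VMEM score window open
      }
    }
  };

  int main() {
    XcntModel B;
    B.PendingEvents = (1u << SMEM_GROUP) | (1u << VMEM_GROUP);
    B.ScoreUB = 2;
    B.applyXcntViaKmcnt();
    assert(!B.hasPendingEvent(SMEM_GROUP)); // SMEM side resolved
    assert(B.hasPendingEvent(VMEM_GROUP));  // VMEM still outstanding
    return 0;
  }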
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 68 ++++++++++---------
.../AMDGPU/flat-load-saddr-to-vaddr.ll | 5 --
llvm/test/CodeGen/AMDGPU/mul.ll | 5 --
llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 2 +-
4 files changed, 36 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 11ff1bdb9e2a1..15c1a4950a939 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -565,12 +565,12 @@ class SIInsertWaitcnts {
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
- bool FlushXCnt);
+ MachineInstr *OldWaitcntInstr,
+ bool FlushVmCnt);
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr, bool FlushXCnt);
+ MachineInstr *OldWaitcntInstr);
void updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets);
bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -1291,15 +1291,33 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
// Wait on XCNT is redundant if we are already waiting for a load to complete.
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
// zero.
- if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
- return applyWaitcnt(X_CNT, 0);
+  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
+    bool HasPendingVmem = hasPendingEvent(VMEM_GROUP);
+    unsigned LB = getScoreLB(X_CNT);
+    applyWaitcnt(X_CNT, 0);
+    if (HasPendingVmem) {
+      // This wait only covers the SMEM group: reactivate the pending
+      // VMEM_GROUP event and restore the score lower bound.
+      PendingEvents |= (1 << VMEM_GROUP);
+      setScoreLB(X_CNT, LB);
+    }
+    return;
+  }
// If we have pending store we cannot optimize XCnt because we do not wait for
 // stores. VMEM loads return in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT))
- return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ !hasPendingEvent(STORE_CNT)) {
+    unsigned LB = getScoreLB(X_CNT);
+    bool HasPendingSmem = hasPendingEvent(SMEM_GROUP);
+    applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+    if (HasPendingSmem) {
+      // Likewise, keep the pending SMEM_GROUP event and its score window.
+      PendingEvents |= (1 << SMEM_GROUP);
+      setScoreLB(X_CNT, LB);
+    }
+ return;
+ }
applyWaitcnt(X_CNT, Wait.XCnt);
}
@@ -1841,13 +1859,12 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
-/// If FlushVmCnt/FlushXcnt is true, that means that we want to
-/// generate a s_waitcnt to flush the vmcnt/xcnt counter here.
+/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
+/// flush the vmcnt counter here.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
- bool FlushVmCnt,
- bool FlushXCnt) {
+ bool FlushVmCnt) {
setForceEmitWaitcnt();
assert(!MI.isMetaInstruction());
@@ -2102,26 +2119,18 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.BvhCnt = 0;
}
-  // Conservatively flush the XCnt counter at the start of the block.
-  if (FlushXCnt && ScoreBrackets.hasPendingEvent(SMEM_GROUP) &&
-      ScoreBrackets.hasPendingEvent(VMEM_GROUP))
-    Wait.XCnt = 0;
-
if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
Wait.LoadCnt = 0;
return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
- OldWaitcntInstr, FlushXCnt);
+ OldWaitcntInstr);
}
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr,
- bool FlushXCnt) {
+ MachineInstr *OldWaitcntInstr) {
bool Modified = false;
if (OldWaitcntInstr)
@@ -2150,9 +2159,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
}
// XCnt may be already consumed by a load wait.
- // If we need to flush the Xcnt counter, don't
- // combine it with any other wait events.
- if (Wait.XCnt != ~0u && !FlushXCnt) {
+ if (Wait.XCnt != ~0u) {
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
Wait.XCnt = ~0u;
@@ -2224,9 +2231,8 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
ScoreBrackets.simplifyWaitcnt(Wait);
auto SuccessorIt = std::next(Inst.getIterator());
- bool Result =
- generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
- /*OldWaitcntInstr=*/nullptr, /*FlushXCnt=*/false);
+ bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
+ /*OldWaitcntInstr=*/nullptr);
if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
@@ -2466,7 +2472,6 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
- bool FirstInstInBlock = true;
for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
E = Block.instr_end();
@@ -2488,13 +2493,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
isPreheaderToFlush(Block, ScoreBrackets);
-    bool FlushXCnt = FirstInstInBlock;
-    FirstInstInBlock = false;
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
- FlushVmCnt, FlushXCnt);
+ FlushVmCnt);
OldWaitcntInstr = nullptr;
// Restore vccz if it's not known to be correct already.
@@ -2583,7 +2585,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Combine or remove any redundant waitcnts at the end of the block.
Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
- OldWaitcntInstr, /*FlushXCnt=*/false);
+ OldWaitcntInstr);
LLVM_DEBUG({
dbgs() << "*** End Block: ";
diff --git a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
index 564b2a8af338c..e8efa859ce13f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
@@ -10,10 +10,6 @@
; Check that we are changing SADDR form of a load to VADDR and do not have to use
; readfirstlane instructions to move address from VGPRs into SGPRs.
-; FIXME: Redundant xcnt in the loop header.
-; Pending xcnt events should check if they can be folded into soft waitcnts
-; before being propagated.
-
define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg1, ptr nocapture %arg2) {
; GCN-LABEL: test_move_load_address_to_vgpr:
; GCN: ; %bb.0: ; %bb
@@ -28,7 +24,6 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap
; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GCN-NEXT: .LBB0_1: ; %bb3
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index b9797cda5fa57..d29847e40dc8b 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2583,13 +2583,8 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX1250-NEXT: s_branch .LBB15_6
; GFX1250-NEXT: .LBB15_5:
; GFX1250-NEXT: v_mov_b32_e32 v0, s7
-; TODO: The xcnt can actually be merged into the loadcnt at the bottom.
-; If, instead of checking at the first instruction in the block, we
-; checked at the instruction that actually needs an xcnt whether it has
-; a pending smem and vmem, this extra xcnt could be avoided.
; GFX1250-NEXT: .LBB15_6: ; %endif
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
; GFX1250-NEXT: s_mov_b32 s2, -1
; GFX1250-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index 2a80de849aec7..dcb7388f392a6 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -969,8 +969,8 @@ body: |
; GCN-NEXT: liveins: $sgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAIT_KMCNT 0
- ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
bb.0:
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc