[llvm] [AMDGPU] Remove redundant S_WAIT_XCNT after inserting S_SET_VGPR_MSB (PR #188527)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 25 09:55:39 PDT 2026
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/188527
>From 426f65b1719499855f7a1957f2fa3c2d234cfae1 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 25 Mar 2026 13:42:50 +0000
Subject: [PATCH 1/3] [AMDGPU] Tweak Changed tracking in
AMDGPULowerVGPREncoding. NFC.
Track changed status in a member variable. This frees up the return
value of runOnMachineInstr for other purposes.
---
.../Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 65 ++++++++++---------
1 file changed, 35 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index 649a113c2fbe5..1361526b2d4ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -134,6 +134,8 @@ class AMDGPULowerVGPREncoding {
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
+ bool Changed = false;
+
// Current basic block.
MachineBasicBlock *MBB;
@@ -160,8 +162,8 @@ class AMDGPULowerVGPREncoding {
/// must be preceded by S_NOP to avoid the hazard.
bool NeedNopBeforeSetVGPRMSB;
- /// Insert mode change before \p I. \returns true if mode was changed.
- bool setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I);
+ /// Insert mode change before \p I.
+ void setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I);
/// Reset mode to default.
void resetMode(MachineBasicBlock::instr_iterator I) {
@@ -174,8 +176,8 @@ class AMDGPULowerVGPREncoding {
/// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
- /// Handle single \p MI. \return true if changed.
- bool runOnMachineInstr(MachineInstr &MI);
+ /// Handle single \p MI.
+ void runOnMachineInstr(MachineInstr &MI);
/// Compute the mode for a single \p MI given \p Ops operands
/// bit mapping. Optionally takes second array \p Ops2 for VOPD.
@@ -199,16 +201,15 @@ class AMDGPULowerVGPREncoding {
/// Handle S_SETREG_IMM32_B32 targeting MODE register. On certain hardware,
/// this instruction clobbers VGPR MSB bits[12:19], so we need to restore
- /// the current mode. \returns true if the instruction was modified or a
- /// new one was inserted.
- bool handleSetregMode(MachineInstr &MI);
+ /// the current mode.
+ void handleSetregMode(MachineInstr &MI);
/// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain
- /// the VGPR MSB mode value. \returns true if the immediate was changed.
- bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
+ /// the VGPR MSB mode value.
+ void updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
};
-bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
+void AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
MachineBasicBlock::instr_iterator I) {
LLVM_DEBUG({
dbgs() << " setMode: NewMode=";
@@ -228,7 +229,7 @@ bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
bool Rewritten = false;
if (!CurrentMode.update(NewMode, Rewritten)) {
LLVM_DEBUG(dbgs() << " -> no change needed\n");
- return false;
+ return;
}
LLVM_DEBUG(dbgs() << " Rewritten=" << Rewritten << " after update\n");
@@ -241,6 +242,7 @@ bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
// Carry old mode bits from the existing instruction.
int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth);
Op.setImm(CurrentMode.encode() | OldModeBits);
+ Changed = true;
LLVM_DEBUG(dbgs() << " -> piggybacked onto S_SET_VGPR_MSB: "
<< *MostRecentModeSet);
} else {
@@ -251,7 +253,7 @@ bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
<< *MostRecentModeSet);
}
- return true;
+ return;
}
I = handleClause(I);
@@ -266,11 +268,11 @@ bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
}
MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
.addImm(NewMode.encode() | OldModeBits);
+ Changed = true;
LLVM_DEBUG(dbgs() << " -> inserted new S_SET_VGPR_MSB: "
<< *MostRecentModeSet);
CurrentMode = NewMode;
- return true;
}
std::optional<unsigned>
@@ -334,7 +336,7 @@ void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode,
}
}
-bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
+void AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
if (Ops.first) {
ModeTy NewMode;
@@ -363,17 +365,17 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
bool Unused = false;
CurrentMode.update(NewModeCommuted, Unused);
// MI was modified by the commute above.
- return true;
+ Changed = true;
+ return;
}
// Commute back.
if (!TII->commuteInstruction(MI))
llvm_unreachable("Failed to restore commuted instruction.");
}
- return setMode(NewMode, MI.getIterator());
+ setMode(NewMode, MI.getIterator());
+ return;
}
assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
-
- return false;
}
MachineBasicBlock::instr_iterator
@@ -401,8 +403,10 @@ AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
// If it does not clause will just become shorter. Since the length
// recorded in the clause is one less, increment the length after the
// update. Note that SIMM16[5:0] must be 1-62, not 0 or 63.
- if (ClauseLen < 63)
+ if (ClauseLen < 63) {
Clause->getOperand(0).setImm(ClauseLen | (ClauseBreaks << 8));
+ Changed = true;
+ }
++ClauseLen;
@@ -439,7 +443,7 @@ static int64_t convertModeToSetregFormat(int64_t Mode) {
return llvm::rotl<uint8_t>(static_cast<uint8_t>(Mode), /*R=*/2);
}
-bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI,
+void AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI,
int64_t ModeValue) {
assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32);
@@ -450,11 +454,13 @@ bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI,
int64_t OldImm = ImmOp->getImm();
int64_t NewImm =
(OldImm & ~AMDGPU::Hwreg::VGPR_MSB_MASK) | (SetregMode << VGPRMSBShift);
+ if (NewImm == OldImm)
+ return;
ImmOp->setImm(NewImm);
- return NewImm != OldImm;
+ Changed = true;
}
-bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
+void AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
using namespace AMDGPU::Hwreg;
assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
@@ -471,7 +477,7 @@ bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
<< " Size=" << Size << '\n');
if (HwRegId != ID_MODE) {
LLVM_DEBUG(dbgs() << " -> not ID_MODE, skipping\n");
- return false;
+ return;
}
int64_t ModeValue = CurrentMode.encode();
@@ -494,10 +500,10 @@ bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
// the required mode into bits[12:19] without triggering Rewritten.
MostRecentModeSet = &MI;
CurrentMode = {};
- bool Changed = updateSetregModeImm(MI, 0);
+ updateSetregModeImm(MI, 0);
LLVM_DEBUG(dbgs() << " -> reset CurrentMode, cleared bits[12:19]: "
<< MI);
- return Changed;
+ return;
}
// Case 2: Size > 12 - the original instruction uses bits beyond 11, so we
@@ -521,7 +527,7 @@ bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
NeedNopBeforeSetVGPRMSB = true;
LLVM_DEBUG(dbgs() << " -> bits[12:19] already correct, "
"invalidated MostRecentModeSet\n");
- return false;
+ return;
}
// imm32[12:19] doesn't match VGPR MSBs - insert s_set_vgpr_msb after
@@ -533,9 +539,9 @@ bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
MostRecentModeSet = BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
TII->get(AMDGPU::S_SET_VGPR_MSB))
.addImm(ModeValue);
+ Changed = true;
LLVM_DEBUG(dbgs() << " -> inserted S_SET_VGPR_MSB after setreg: "
<< *MostRecentModeSet);
- return true;
}
bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
@@ -549,7 +555,6 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "*** AMDGPULowerVGPREncoding on " << MF.getName()
<< " ***\n");
- bool Changed = false;
ClauseLen = ClauseRemaining = 0;
CurrentMode = {};
for (auto &MBB : MF) {
@@ -596,11 +601,11 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
ST.hasSetregVGPRMSBFixup()) {
- Changed |= handleSetregMode(MI);
+ handleSetregMode(MI);
continue;
}
- Changed |= runOnMachineInstr(MI);
+ runOnMachineInstr(MI);
NeedNopBeforeSetVGPRMSB = false;
if (ClauseRemaining)
>From c2d655e03c86552b2d2dc82fa5656efa866cd1ee Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 25 Mar 2026 14:49:36 +0000
Subject: [PATCH 2/3] [AMDGPU] Remove redundant S_WAIT_XCNT after inserting
S_SET_VGPR_MSB
---
.../Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 60 ++++++++++++++-----
.../CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir | 33 ++++++++++
.../CodeGen/AMDGPU/vgpr-set-msb-coissue.mir | 1 -
3 files changed, 78 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index 1361526b2d4ad..4dc8481ef5c9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -162,8 +162,9 @@ class AMDGPULowerVGPREncoding {
/// must be preceded by S_NOP to avoid the hazard.
bool NeedNopBeforeSetVGPRMSB;
- /// Insert mode change before \p I.
- void setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I);
+ /// Insert mode change before \p I. \return true if we inserted a new
+ /// S_SET_VGPR_MSB instruction.
+ bool setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I);
/// Reset mode to default.
void resetMode(MachineBasicBlock::instr_iterator I) {
@@ -176,8 +177,9 @@ class AMDGPULowerVGPREncoding {
/// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
- /// Handle single \p MI.
- void runOnMachineInstr(MachineInstr &MI);
+ /// Handle single \p MI. \return true if we inserted a new S_SET_VGPR_MSB
+ /// instruction.
+ bool runOnMachineInstr(MachineInstr &MI);
/// Compute the mode for a single \p MI given \p Ops operands
/// bit mapping. Optionally takes second array \p Ops2 for VOPD.
@@ -209,7 +211,7 @@ class AMDGPULowerVGPREncoding {
void updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
};
-void AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
+bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
MachineBasicBlock::instr_iterator I) {
LLVM_DEBUG({
dbgs() << " setMode: NewMode=";
@@ -229,7 +231,7 @@ void AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
bool Rewritten = false;
if (!CurrentMode.update(NewMode, Rewritten)) {
LLVM_DEBUG(dbgs() << " -> no change needed\n");
- return;
+ return false;
}
LLVM_DEBUG(dbgs() << " Rewritten=" << Rewritten << " after update\n");
@@ -253,26 +255,37 @@ void AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
<< *MostRecentModeSet);
}
- return;
+ return false;
}
- I = handleClause(I);
- I = handleCoissue(I);
+ MachineBasicBlock::instr_iterator InsertPt = handleClause(I);
+ InsertPt = handleCoissue(InsertPt);
// Case 2 match in handleSetregMode: the setreg's imm[12:19] matched
// current MSBs, but the next VALU needs different MSBs, so this
// S_SET_VGPR_MSB would land right after the setreg. Insert S_NOP to
// prevent it from being silently dropped.
if (NeedNopBeforeSetVGPRMSB) {
- BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_NOP)).addImm(0);
+ BuildMI(*MBB, InsertPt, {}, TII->get(AMDGPU::S_NOP)).addImm(0);
NeedNopBeforeSetVGPRMSB = false;
}
- MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
- .addImm(NewMode.encode() | OldModeBits);
+ MostRecentModeSet =
+ BuildMI(*MBB, InsertPt, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
+ .addImm(NewMode.encode() | OldModeBits);
Changed = true;
LLVM_DEBUG(dbgs() << " -> inserted new S_SET_VGPR_MSB: "
<< *MostRecentModeSet);
+ // If we inserted S_SET_VGPR_MSB early then XCNT should remain zero from the
+ // insertion point to the current instruction. Remove any redundant
+ // S_WAIT_XCNT instructions in that range.
+ for (MachineInstr &MI : make_early_inc_range(make_range(InsertPt, I))) {
+ assert(!SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isSMRD(MI));
+ if (MI.getOpcode() == AMDGPU::S_WAIT_XCNT)
+ MI.eraseFromBundle();
+ }
+
CurrentMode = NewMode;
+ return true;
}
std::optional<unsigned>
@@ -336,7 +349,7 @@ void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode,
}
}
-void AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
+bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
if (Ops.first) {
ModeTy NewMode;
@@ -366,7 +379,7 @@ void AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
CurrentMode.update(NewModeCommuted, Unused);
// MI was modified by the commute above.
Changed = true;
- return;
+ return false;
}
// Commute back.
if (!TII->commuteInstruction(MI))
@@ -376,6 +389,7 @@ void AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
return;
}
assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
+ return false;
}
MachineBasicBlock::instr_iterator
@@ -562,6 +576,10 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
NeedNopBeforeSetVGPRMSB = false;
this->MBB = &MBB;
+ // Remember whether XCNT is known to be zero because of an S_SET_VGPR_MSB
+ // instruction that we inserted, which implicitly waits for XCNT==0.
+ bool XCntIsZero = false;
+
LLVM_DEBUG(dbgs() << "BB#" << MBB.getNumber() << ' ' << MBB.getName()
<< ":\n");
@@ -605,9 +623,21 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
continue;
}
- runOnMachineInstr(MI);
+ // If XCNT is known to be zero then any S_WAIT_XCNT instruction is
+ // redundant and can be removed.
+ if (MI.getOpcode() == AMDGPU::S_WAIT_XCNT && XCntIsZero) {
+ MI.eraseFromBundle();
+ Changed = true;
+ continue;
+ }
+
+ XCntIsZero |= runOnMachineInstr(MI);
NeedNopBeforeSetVGPRMSB = false;
+ // Any VMEM or SMEM instruction may increment XCNT.
+ if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSMRD(MI))
+ XCntIsZero = false;
+
if (ClauseRemaining)
--ClauseRemaining;
}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
index 1215bc7364343..fa673027fcbf0 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
@@ -1010,3 +1010,36 @@ body: |
; ASM: NumVgprs: 514
...
+
+# ASM-LABEL: {{^}}redundant_xcnt:
+# DIS-LABEL: <redundant_xcnt>:
+---
+name: redundant_xcnt
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; ASM: %bb.0:
+
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr256, $vgpr257
+
+ ; GCN-NEXT: global_load_b32 v5, v[0:1], off
+ $vgpr5 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+
+ ; GCN-NEXT: global_load_b32 v5, v[2:3], off
+ $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+
+ S_WAIT_XCNT 1
+
+ ; GCN-NEXT: s_set_vgpr_msb 4
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=1 src2=0
+ ; GCN-NEXT: v_add_f32_e32 v0, 1, v0 /*v256*/
+ $vgpr0 = V_ADD_F32_e32 1, $vgpr256, implicit $mode, implicit $exec
+
+ S_WAIT_XCNT 0
+
+ ; GCN-NEXT: v_add_f32_e32 v2, 1, v1 /*v257*/
+ $vgpr2 = V_ADD_F32_e32 1, $vgpr257, implicit $mode, implicit $exec
+
+ ; GCN-NEXT: s_set_vgpr_msb 0x400
+ ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-set-msb-coissue.mir b/llvm/test/CodeGen/AMDGPU/vgpr-set-msb-coissue.mir
index c5b7ebe0c78f3..a6666b308437f 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-set-msb-coissue.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-set-msb-coissue.mir
@@ -96,7 +96,6 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr11 = nofpexcept V_EXP_F32_e32 killed $vgpr10, implicit $mode, implicit $exec
; CHECK-NEXT: S_SET_VGPR_MSB 65, implicit-def $mode
- ; CHECK-NEXT: S_WAIT_XCNT 0
; CHECK-NEXT: $vgpr256 = nofpexcept V_EXP_F32_e32 killed $vgpr257, implicit $mode, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
$vgpr11 = nofpexcept V_EXP_F32_e32 killed $vgpr10, implicit $mode, implicit $exec
>From f638cc1e24f1e90e8461c992d72415d6da507107 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 25 Mar 2026 16:55:20 +0000
Subject: [PATCH 3/3] Fix build failure
---
llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index 4dc8481ef5c9e..1b627f106f14b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -385,8 +385,7 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
if (!TII->commuteInstruction(MI))
llvm_unreachable("Failed to restore commuted instruction.");
}
- setMode(NewMode, MI.getIterator());
- return;
+ return setMode(NewMode, MI.getIterator());
}
assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
return false;
More information about the llvm-commits
mailing list