[llvm] insertwaitcnt pass update for true16 (PR #128927)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 26 10:10:36 PST 2025
https://github.com/broxigarchen created https://github.com/llvm/llvm-project/pull/128927
None
>From 8f8c844aca2144ea1a5db14be4ad4ec39ad277b3 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Wed, 26 Feb 2025 13:09:03 -0500
Subject: [PATCH] insertwaitcnt pass update for true16
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 48 +++++++++++++--------
1 file changed, 29 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ee263f58bcaf2..d86bf27aa96cd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -137,10 +137,10 @@ enum WaitEventType {
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
- SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
- AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
- SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
- NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
+ SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets.
+ AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
+ SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
+ NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
// Artificial register slots to track LDS writes into specific LDS locations
// if a location is known. When slots are exhausted or location is
// unknown use the first slot. The first slot is also always updated in
@@ -165,6 +165,17 @@ enum VmemType {
NUM_VMEM_TYPES
};
+static unsigned getRegPoint(MCRegister Reg, const SIRegisterInfo &TRI) {
+ // Map Reg to an interval point: (vector ? 0x200 : 0) | HW index << 1 | hi16.
+ // This orders points so 32-bit VGPR intervals include their 16-bit halves.
+ MCRegister MCReg = AMDGPU::getMCReg(Reg, TRI.getSubtarget());
+ unsigned RegIdx = TRI.getHWRegIndex(MCReg);
+ bool IsHi = AMDGPU::isHi16Reg(MCReg, TRI);
+ bool IsVector = TRI.isVectorRegister(MCReg);
+ assert(isUInt<8>(RegIdx)); // Keeps RegIdx << 1 below the 0x200 vector bit.
+ return (IsVector ? 0x200 : 0) | (RegIdx << 1) | (IsHi ? 1 : 0);
+}
+
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true.
@@ -757,30 +768,31 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
RegInterval Result;
- unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
- AMDGPU::HWEncoding::REG_IDX_MASK;
+ unsigned Reg = getRegPoint(Op.getReg(), *TRI);
+ const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ // VGPRs are tracked in 16-bit units, SGPRs in 32-bit units.
if (TRI->isVectorRegister(*MRI, Op.getReg())) {
assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
Result.first = Reg - Encoding.VGPR0;
if (TRI->isAGPR(*MRI, Op.getReg()))
Result.first += AGPR_OFFSET;
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
+ assert(Size % 16 == 0);
+ Result.second = Result.first + (Size / 16);
} else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
- assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
- Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
+ assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS * 2);
+ Result.first = ((Reg - Encoding.SGPR0) >> 1) + NUM_ALL_VGPRS;
assert(Result.first >= NUM_ALL_VGPRS &&
Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
+ Result.second = Result.first + divideCeil(Size, 32);
}
// TODO: Handle TTMP
// else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
else
return {-1, -1};
- const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
- unsigned Size = TRI->getRegSizeInBits(*RC);
- Result.second = Result.first + ((Size + 16) / 32);
-
return Result;
}
@@ -2452,16 +2464,14 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
- assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+ assert(NumVGPRsMax + AGPR_OFFSET <= SQ_MAX_PGM_VGPRS);
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
RegisterEncoding Encoding = {};
- Encoding.VGPR0 =
- TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
- Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
- Encoding.SGPR0 =
- TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
- Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
+ Encoding.VGPR0 = getRegPoint(AMDGPU::VGPR0, *TRI);
+ Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax * 2 - 1;
+ Encoding.SGPR0 = getRegPoint(AMDGPU::SGPR0, *TRI);
+ Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax * 2 - 1;
BlockInfos.clear();
bool Modified = false;
More information about the llvm-commits
mailing list