[llvm] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (PR #77438)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 9 02:30:20 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: Jay Foad (jayfoad)
Changes:
Update SIMemoryLegalizer and SIInsertWaitcnts to use separate wait
instructions per counter (e.g. S_WAIT_LOADCNT) and split VMCNT into
separate LOADCNT, SAMPLECNT and BVHCNT counters.
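
To make the split concrete: a single pre-GFX12 `s_waitcnt vmcnt(N)` covered plain loads, image-sample reads and BVH reads alike, whereas GFX12 tracks and waits on them independently. Below is a minimal standalone C++ sketch (not code from the patch; the enums and mask tables are trimmed to the entries relevant here) of how per-target event masks route the same events to different counters:

```cpp
#include <cassert>
#include <cstdio>

// Counters and wait events, simplified from the patch.
enum InstCounterType { LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT,
                       SAMPLE_CNT, BVH_CNT, KM_CNT, NUM_INST_CNTS };
enum WaitEventType { VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
                     VMEM_BVH_READ_ACCESS, NUM_WAIT_EVENTS };

// Pre-GFX12: plain, sampler and BVH reads all count against VMcnt (LOAD_CNT).
static const unsigned MaskPreGFX12[NUM_INST_CNTS] = {
    (1u << VMEM_READ_ACCESS) | (1u << VMEM_SAMPLER_READ_ACCESS) |
        (1u << VMEM_BVH_READ_ACCESS),
    0, 0, 0, 0, 0, 0};

// GFX12+: sampler and BVH reads move to their own SAMPLEcnt/BVHcnt counters.
static const unsigned MaskGFX12Plus[NUM_INST_CNTS] = {
    (1u << VMEM_READ_ACCESS), 0, 0, 0,
    (1u << VMEM_SAMPLER_READ_ACCESS), (1u << VMEM_BVH_READ_ACCESS), 0};

// Same scheme as the patch's eventCounter(): the counter for an event is the
// first one whose mask contains that event.
static InstCounterType eventCounter(const unsigned *Masks, WaitEventType E) {
  for (int T = 0; T != NUM_INST_CNTS; ++T)
    if (Masks[T] & (1u << E))
      return static_cast<InstCounterType>(T);
  return NUM_INST_CNTS; // unreachable for well-formed masks
}

int main() {
  assert(eventCounter(MaskPreGFX12, VMEM_SAMPLER_READ_ACCESS) == LOAD_CNT);
  assert(eventCounter(MaskGFX12Plus, VMEM_SAMPLER_READ_ACCESS) == SAMPLE_CNT);
  assert(eventCounter(MaskGFX12Plus, VMEM_BVH_READ_ACCESS) == BVH_CNT);
  std::puts("sampler/BVH reads: VMcnt pre-GFX12, SAMPLEcnt/BVHcnt on GFX12+");
}
```

The real tables live in WaitcntGeneratorPreGFX12::getWaitEventMask() and WaitcntGeneratorGFX12Plus::getWaitEventMask() in the diff below.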
---
Patch is 1.32 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77438.diff
101 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+2-1)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+4)
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+803-217)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+1-2)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+16-21)
- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+180)
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+7)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+143-3)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+98-25)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll (+40-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll (+85-17)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll (+18-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+129-91)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll (+90-18)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll (+27-11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll (+40-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll (+90-50)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+51-15)
- (modified) llvm/test/CodeGen/AMDGPU/add.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+46-46)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+64-64)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+48-48)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+30-15)
- (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+92-92)
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll (+21-9)
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll (+66-18)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll (+126-126)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+380-268)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+238-293)
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/fminimum.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+188-188)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-store.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+211-211)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll (+54-54)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll (+8-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll (+39-39)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll (+62-62)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll (+7-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll (+9-5)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll (+52-52)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll (+43-43)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll (+133-85)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll (+11-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll (+26-26)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-f32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-f64.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+88-88)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+79-79)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+50-50)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i64.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+104-104)
- (modified) llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+38-34)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-flat.ll (+369-197)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-global.ll (+369-197)
- (modified) llvm/test/CodeGen/AMDGPU/readcyclecounter.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir (+1-1)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a7d8ff0242b801..7ca7722a5cebd1 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1242,7 +1242,8 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
case AMDGPU::S_WAITCNT: {
const int64_t Imm = MI.getOperand(0).getImm();
AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
- return (Decoded.LgkmCnt == 0);
+ // DsCnt corresponds to LGKMCnt here.
+ return (Decoded.DsCnt == 0);
}
default:
// SOPP instructions cannot mitigate the hazard.
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f6f37f5170a403..f3d38c018d2709 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1175,6 +1175,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
+ /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
+ /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
+ bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1cb1d32707f2d7..37890881d57544 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -57,7 +57,18 @@ namespace {
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emitted.
-enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
+enum InstCounterType {
+ LOAD_CNT = 0, // VMcnt prior to gfx12.
+ DS_CNT, // LGKMcnt prior to gfx12.
+ EXP_CNT, //
+ STORE_CNT, // VScnt in gfx10/gfx11.
+ NUM_NORMAL_INST_CNTS,
+ SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
+ BVH_CNT, // gfx12+ only.
+ KM_CNT, // gfx12+ only.
+ NUM_EXTENDED_INST_CNTS,
+ NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
+};
} // namespace
namespace llvm {
@@ -67,15 +78,23 @@ template <> struct enum_iteration_traits<InstCounterType> {
} // namespace llvm
namespace {
-auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
+// Return an iterator over all counters between LOAD_CNT (the first counter)
+// and \c MaxCounter (exclusive, default value yields an enumeration over
+// all counters).
+auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
+ return enum_seq(LOAD_CNT, MaxCounter);
+}
using RegInterval = std::pair<int, int>;
struct HardwareLimits {
- unsigned VmcntMax;
+ unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
unsigned ExpcntMax;
- unsigned LgkmcntMax;
- unsigned VscntMax;
+ unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
+ unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
+ unsigned SamplecntMax; // gfx12+ only.
+ unsigned BvhcntMax; // gfx12+ only.
+ unsigned KmcntMax; // gfx12+ only.
};
struct RegisterEncoding {
@@ -86,31 +105,25 @@ struct RegisterEncoding {
};
enum WaitEventType {
- VMEM_ACCESS, // vector-memory read & write
- VMEM_READ_ACCESS, // vector-memory read
- VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
- SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
- LDS_ACCESS, // lds read & write
- GDS_ACCESS, // gds read & write
- SQ_MESSAGE, // send message
- SMEM_ACCESS, // scalar-memory read & write
- EXP_GPR_LOCK, // export holding on its data src
- GDS_GPR_LOCK, // GDS holding on its data and addr src
- EXP_POS_ACCESS, // write to export position
- EXP_PARAM_ACCESS, // write to export parameter
- VMW_GPR_LOCK, // vector-memory write holding on its data src
- EXP_LDS_ACCESS, // read by ldsdir counting as export
+ VMEM_ACCESS, // vector-memory read & write
+ VMEM_READ_ACCESS, // vector-memory read
+ VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
+ VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
+ VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
+ SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
+ LDS_ACCESS, // lds read & write
+ GDS_ACCESS, // gds read & write
+ SQ_MESSAGE, // send message
+ SMEM_ACCESS, // scalar-memory read & write
+ EXP_GPR_LOCK, // export holding on its data src
+ GDS_GPR_LOCK, // GDS holding on its data and addr src
+ EXP_POS_ACCESS, // write to export position
+ EXP_PARAM_ACCESS, // write to export parameter
+ VMW_GPR_LOCK, // vector-memory write holding on its data src
+ EXP_LDS_ACCESS, // read by ldsdir counting as export
NUM_WAIT_EVENTS,
};
-static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
- (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
- (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
- (1 << SQ_MESSAGE),
- (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
- (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
- (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)};
-
// The mapping is:
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
@@ -137,17 +150,33 @@ enum VmemType {
// MIMG instructions with a sampler.
VMEM_SAMPLER,
// BVH instructions
- VMEM_BVH
+ VMEM_BVH,
+ NUM_VMEM_TYPES
};
+// Maps values of InstCounterType to the instruction that waits on that
+// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
+// returns true.
+static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
+ AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
+ AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
+ AMDGPU::S_WAIT_KMCNT};
+
static bool updateVMCntOnly(const MachineInstr &Inst) {
return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
SIInstrInfo::isFLATScratch(Inst);
}
+#ifndef NDEBUG
+static bool isNormalMode(InstCounterType MaxCounter) {
+ return MaxCounter == NUM_NORMAL_INST_CNTS;
+}
+#endif // NDEBUG
+
VmemType getVmemType(const MachineInstr &Inst) {
assert(updateVMCntOnly(Inst));
- if (!SIInstrInfo::isMIMG(Inst))
+ if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
+ !SIInstrInfo::isVSAMPLE(Inst))
return VMEM_NOSAMPLER;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
@@ -156,25 +185,49 @@ VmemType getVmemType(const MachineInstr &Inst) {
: BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
}
-void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
switch (T) {
- case VM_CNT:
- Wait.VmCnt = std::min(Wait.VmCnt, Count);
- break;
+ case LOAD_CNT:
+ return Wait.LoadCnt;
case EXP_CNT:
- Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
- break;
- case LGKM_CNT:
- Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
- break;
- case VS_CNT:
- Wait.VsCnt = std::min(Wait.VsCnt, Count);
- break;
+ return Wait.ExpCnt;
+ case DS_CNT:
+ return Wait.DsCnt;
+ case STORE_CNT:
+ return Wait.StoreCnt;
+ case SAMPLE_CNT:
+ return Wait.SampleCnt;
+ case BVH_CNT:
+ return Wait.BvhCnt;
+ case KM_CNT:
+ return Wait.KmCnt;
default:
llvm_unreachable("bad InstCounterType");
}
}
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+ unsigned &WC = getCounterRef(Wait, T);
+ WC = std::min(WC, Count);
+}
+
+void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+ getCounterRef(Wait, T) = ~0u;
+}
+
+unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+ return getCounterRef(Wait, T);
+}
+
+// Mapping from event to counter according to the table masks.
+InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
+ for (auto T : inst_counter_types()) {
+ if (masks[T] & (1 << E))
+ return T;
+ }
+ llvm_unreachable("event type has no associated counter");
+}
+
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
@@ -185,20 +238,30 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
- WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
- RegisterEncoding Encoding)
- : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
+ WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
+ HardwareLimits Limits, RegisterEncoding Encoding,
+ const unsigned *WaitEventMaskForInst,
+ InstCounterType SmemAccessCounter)
+ : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
+ Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
+ SmemAccessCounter(SmemAccessCounter) {}
unsigned getWaitCountMax(InstCounterType T) const {
switch (T) {
- case VM_CNT:
- return Limits.VmcntMax;
- case LGKM_CNT:
- return Limits.LgkmcntMax;
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
case EXP_CNT:
return Limits.ExpcntMax;
- case VS_CNT:
- return Limits.VscntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
default:
break;
}
@@ -219,20 +282,11 @@ class WaitcntBrackets {
return getScoreUB(T) - getScoreLB(T);
}
- // Mapping from event to counter.
- InstCounterType eventCounter(WaitEventType E) const {
- for (auto T : inst_counter_types()) {
- if (WaitEventMaskForInst[T] & (1 << E))
- return T;
- }
- llvm_unreachable("event type has no associated counter");
- }
-
unsigned getRegScore(int GprNo, InstCounterType T) const {
if (GprNo < NUM_ALL_VGPRS) {
return VgprScores[T][GprNo];
}
- assert(T == LGKM_CNT);
+ assert(T == SmemAccessCounter);
return SgprScores[GprNo - NUM_ALL_VGPRS];
}
@@ -269,15 +323,15 @@ class WaitcntBrackets {
}
bool hasPendingFlat() const {
- return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
- LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
- (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
- LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
+ return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
+ LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
+ (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
+ LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
}
void setPendingFlat() {
- LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
- LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
+ LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
+ LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
}
// Return true if there might be pending writes to the specified vgpr by VMEM
@@ -293,8 +347,8 @@ class WaitcntBrackets {
}
void setNonKernelFunctionInitialState() {
- setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
- PendingEvents |= WaitEventMaskForInst[VS_CNT];
+ setScoreUB(STORE_CNT, getWaitCountMax(STORE_CNT));
+ PendingEvents |= WaitEventMaskForInst[STORE_CNT];
}
void print(raw_ostream &);
@@ -331,7 +385,7 @@ class WaitcntBrackets {
VgprUB = std::max(VgprUB, GprNo);
VgprScores[T][GprNo] = Val;
} else {
- assert(T == LGKM_CNT);
+ assert(T == SmemAccessCounter);
SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
}
@@ -342,8 +396,11 @@ class WaitcntBrackets {
unsigned OpNo, unsigned Val);
const GCNSubtarget *ST = nullptr;
+ InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
HardwareLimits Limits = {};
RegisterEncoding Encoding = {};
+ const unsigned *WaitEventMaskForInst;
+ InstCounterType SmemAccessCounter;
unsigned ScoreLBs[NUM_INST_CNTS] = {0};
unsigned ScoreUBs[NUM_INST_CNTS] = {0};
unsigned PendingEvents = 0;
@@ -354,20 +411,139 @@ class WaitcntBrackets {
int VgprUB = -1;
int SgprUB = -1;
unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
+ // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
+ // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
// write to each vgpr.
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
};
+// This abstracts the logic for generating and updating S_WAIT* instructions
+// away from the analysis that determines where they are needed. This was
+// done because the set of counters and instructions for waiting on them
+// underwent a major shift with gfx12, sufficiently so that having this
+// abstraction allows the main analysis logic to be simpler than it would
+// otherwise have had to become.
+class WaitcntGenerator {
+protected:
+ const GCNSubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ AMDGPU::IsaVersion IV;
+ InstCounterType MaxCounter;
+
+public:
+ WaitcntGenerator() {}
+ WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
+ : ST(ST), TII(ST->getInstrInfo()),
+ IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
+
+ // Edits an existing sequence of wait count instructions according
+ // to an incoming Waitcnt value, which is itself updated to reflect
+ // any new wait count instructions which may need to be generated by
+ // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
+ // were made.
+ //
+ // This editing will usually merely update operands, but it may also
+ // delete instructions if the incoming Wait value indicates they are not
+ // needed. It may also remove existing instructions for which a wait
+ // is needed if it can be determined that it is better to generate new
+ // instructions later, as can happen on gfx12.
+ virtual bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const = 0;
+
+ // Transform a soft waitcnt into a normal one.
+ bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
+
+ // Generates new wait count instructions according to the value of
+ // Wait, returning true if any new instructions were created.
+ virtual bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) = 0;
+
+ // Returns an array of bit masks which can be used to map values in
+ // WaitEventType to corresponding counter values in InstCounterType.
+ virtual const unsigned *getWaitEventMask() const = 0;
+
+ virtual ~WaitcntGenerator() = default;
+};
+
+class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
+public:
+ WaitcntGeneratorPreGFX12() {}
+ WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
+ : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
+
+ bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const override;
+
+ bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) override;
+
+ const unsigned *getWaitEventMask() const override {
+ assert(ST);
+
+ static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) |
+ (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS),
+ (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+ (1 << SQ_MESSAGE),
+ (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+ (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
+ (1 << EXP_LDS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
+ 0,
+ 0,
+ 0};
+
+ return WaitEventMaskForInstPreGFX12;
+ }
+};
+
+class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
+public:
+ WaitcntGeneratorGFX12Plus() {}
+ WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
+ : WaitcntGenerator(ST, MaxCounter) {}
+
+ bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const override;
+
+ bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) override;
+
+ const unsigned *getWaitEventMask() const override {
+ assert(ST);
+
+ static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
+ (1 << LDS_ACCESS) | (1 << GDS_ACCESS),
+ (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+ (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
+ (1 << EXP_LDS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
+ (1 << VMEM_SAMPLER_READ_ACCESS),
+ (1 << VMEM_BVH_READ_ACCESS),
+ (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)};
+
+ return WaitEventMaskForInstGFX12Plus;
+ }
+};
+
class SIInsertWaitcnts : public MachineFunctionPass {
private:
const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
- AMDGPU::IsaVersion IV;
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
@@ -379,6 +555,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool Dirty = true;
};
+ InstCounterType SmemAccessCounter;
+
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
@@ -388,10 +566,20 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool OptNone;
+ // In any given run of this pass, WCG will point to one of these two
+ // generator objects, which must have been re-initialised before use
+ // from a value constructed with the current subtarget.
+ WaitcntGeneratorPreGFX12 WCGPreGFX12;
+ WaitcntGeneratorGFX12Plus WCGGFX12Plus;
+
+ WaitcntGenerator *WCG = nullptr;
+
// S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
+ InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+
public:
static char ID;
@@ -438,16 +626,22 @@ class SIInsertWaitcnts : public MachineFunctionPass {
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
DebugCounter::shouldExecute(ForceLgkmCounter)) {
- ForceEmitWaitcnt[LGKM_CNT] = true;
+ ForceEmitWaitcnt[DS_CNT] = true;
+ ForceEmitWaitcnt[KM_CNT] = true;
} else {
- ForceEmitWaitcnt[LGKM_CNT] = false;
+ ForceEmitWaitcnt[DS_CNT] = false;
+ ForceEmitWaitcnt[KM_CNT] = false;
}
if (DebugCounter::isCounterSet(ForceVMCounter) &&
DebugCounter::shouldExecute(ForceVMCounter)) {
- ForceEmitWaitcnt[VM_CNT] = true;
+ ForceEmitWaitcnt[LOAD_CNT] = true;
+ ForceEmitWaitcnt[SAMPLE_CNT] = ...
[truncated]
``````````
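
One detail worth noting in the refactored helpers above (getCounterRef/addWait): wait requirements merge with std::min because a wait operand is an upper bound on how many operations may still be outstanding, so the smaller value is the stricter requirement. A hedged, self-contained sketch (the field names mirror AMDGPU::Waitcnt, but this is not the patch itself):

```cpp
#include <algorithm>
#include <cassert>

// Stand-in for AMDGPU::Waitcnt with a few of the new per-counter fields;
// ~0u means "no wait required on this counter".
struct Waitcnt {
  unsigned LoadCnt = ~0u;
  unsigned SampleCnt = ~0u;
  unsigned DsCnt = ~0u;
};

// As in the patch's addWait(): merging two requirements keeps the smaller
// (stricter) bound, since waiting for <=N outstanding ops subsumes <=M
// whenever N <= M.
static void addWait(unsigned &Counter, unsigned Count) {
  Counter = std::min(Counter, Count);
}

int main() {
  Waitcnt Wait;
  addWait(Wait.LoadCnt, 2); // tolerate 2 outstanding loads
  addWait(Wait.LoadCnt, 0); // later requirement: all loads done
  assert(Wait.LoadCnt == 0);     // the stricter bound wins
  assert(Wait.SampleCnt == ~0u); // untouched counter: no wait emitted
}
```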
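Finally, the pass now dispatches between WaitcntGeneratorPreGFX12 and WaitcntGeneratorGFX12Plus based on GCNSubtarget::hasExtendedWaitCounts(), and the GFX12+ generator picks one dedicated wait opcode per counter from a lookup table. A sketch of that table (the opcode names here are placeholders for the tablegen-generated AMDGPU::S_WAIT_* enumerators):

```cpp
#include <cassert>

enum InstCounterType { LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT,
                       SAMPLE_CNT, BVH_CNT, KM_CNT, NUM_EXTENDED_INST_CNTS };

// Placeholder opcode names standing in for AMDGPU::S_WAIT_* enumerators.
enum Opcode { S_WAIT_LOADCNT, S_WAIT_DSCNT, S_WAIT_EXPCNT, S_WAIT_STORECNT,
              S_WAIT_SAMPLECNT, S_WAIT_BVHCNT, S_WAIT_KMCNT };

// Mirrors instrsForExtendedCounterTypes from the diff: on GFX12+ each counter
// has a dedicated wait instruction, so selecting one is an array lookup
// instead of packing several fields into a single S_WAITCNT immediate.
static const Opcode WaitOpcodeFor[NUM_EXTENDED_INST_CNTS] = {
    S_WAIT_LOADCNT, S_WAIT_DSCNT, S_WAIT_EXPCNT, S_WAIT_STORECNT,
    S_WAIT_SAMPLECNT, S_WAIT_BVHCNT, S_WAIT_KMCNT};

int main() {
  // WaitcntGeneratorGFX12Plus would emit one such instruction per counter
  // that has a finite wait requirement.
  assert(WaitOpcodeFor[SAMPLE_CNT] == S_WAIT_SAMPLECNT);
  assert(WaitOpcodeFor[KM_CNT] == S_WAIT_KMCNT);
}
```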
https://github.com/llvm/llvm-project/pull/77438