[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
Jun Wang via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 7 11:52:24 PST 2024
================
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsNonTemporal) const override;
};
+/// Common interface of the per-generation helpers that insert wait
+/// instructions after memory instructions.
+///
+/// Instances are obtained through create(), which picks the concrete
+/// implementation from the generation of the subtarget.
+class SIPreciseMemorySupport {
+protected:
+  /// Subtarget this helper was created for.
+  const GCNSubtarget &ST;
+  /// Instruction info of \c ST; used to classify instructions and to build
+  /// the inserted waits.
+  const SIInstrInfo *TII = nullptr;
+
+  /// ISA version of \c ST; needed to encode S_WAITCNT immediates.
+  IsaVersion IV;
+
+  /// Caches the instruction info and the ISA version of \p ST.
+  SIPreciseMemorySupport(const GCNSubtarget &ST)
+      : ST(ST), TII(ST.getInstrInfo()), IV(getIsaVersion(ST.getCPU())) {}
+
+public:
+  /// Virtual because create() hands out derived objects through a
+  /// std::unique_ptr to this base class; destroying them through a
+  /// non-virtual base destructor would be undefined behaviour.
+  virtual ~SIPreciseMemorySupport() = default;
+
+  /// Returns the implementation matching the generation of \p ST.
+  static std::unique_ptr<SIPreciseMemorySupport> create(const GCNSubtarget &ST);
+
+  /// Handles the non-atomic memory instruction \p MI.
+  /// \returns true if the basic block was changed (an instruction was
+  /// inserted), false otherwise.
+  virtual bool handleNonAtomic(MachineBasicBlock::iterator &MI) = 0;
+  /// Handles atomic instruction \p MI with \p ret indicating whether \p MI
+  /// returns a result.
+  /// \returns true if the basic block was changed, false otherwise.
+  virtual bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) = 0;
+};
+
+class SIGfx9PreciseMemorySupport : public SIPreciseMemorySupport { // Picked by create() for generations before GFX10.
+public:
+ SIGfx9PreciseMemorySupport(const GCNSubtarget &ST)
+ : SIPreciseMemorySupport(ST) {} // Base constructor caches TII and the ISA version.
+ bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; // Inserts an S_WAITCNT after MI, except for scalar stores.
+ bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; // Inserts an S_WAITCNT after MI; ret is not consulted by this variant.
+};
+
+class SIGfx10And11PreciseMemorySupport : public SIPreciseMemorySupport { // Picked by create() for GFX10 and every later generation.
+public:
+ SIGfx10And11PreciseMemorySupport(const GCNSubtarget &ST)
+ : SIPreciseMemorySupport(ST) {} // Base constructor caches TII and the ISA version.
+ bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; // May insert S_WAITCNT and/or S_WAITCNT_VSCNT after MI.
+ bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; // Atomic counterpart; ret tells whether MI returns a result.
+};
+
+/// Factory selecting the precise-memory helper for \p ST.
+///
+/// GFX10 and every later generation get SIGfx10And11PreciseMemorySupport;
+/// anything older gets SIGfx9PreciseMemorySupport.
+std::unique_ptr<SIPreciseMemorySupport>
+SIPreciseMemorySupport::create(const GCNSubtarget &ST) {
+  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
+    return std::make_unique<SIGfx10And11PreciseMemorySupport>(ST);
+  return std::make_unique<SIGfx9PreciseMemorySupport>(ST);
+}
+
+/// Inserts an S_WAITCNT right after the non-atomic memory instruction \p MI.
+///
+/// Which counters are waited on (set to zero) depends on the instruction:
+///   - scalar (SMRD) load: LgkmCnt
+///   - VMEM:               VmCnt
+///   - FLAT:               VmCnt and LgkmCnt
+///   - anything else (LDS): LgkmCnt
+/// Vector loads and stores are treated alike. Scalar stores get no wait.
+///
+/// When a wait is inserted, \p MI points at it on return.
+/// \returns true if a wait was inserted, false otherwise.
+bool SIGfx9PreciseMemorySupport::handleNonAtomic(
+    MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &MemInst = *MI;
+  AMDGPU::Waitcnt Wait;
+
+  if (TII->isSMRD(MemInst)) {
+    // Scalar stores are left alone.
+    if (MemInst.mayStore())
+      return false;
+    Wait.DsCnt = 0; // LgkmCnt
+  } else if (TII->isVMEM(MemInst)) {
+    Wait.LoadCnt = 0; // VmCnt
+  } else if (TII->isFLAT(MemInst)) {
+    Wait.LoadCnt = 0; // VmCnt
+    Wait.DsCnt = 0;   // LgkmCnt
+  } else {
+    Wait.DsCnt = 0; // LDS; LgkmCnt
+  }
+
+  // Step past the memory instruction, insert the wait in front of whatever
+  // follows it, then step back so that MI refers to the new S_WAITCNT.
+  ++MI;
+  BuildMI(*MemInst.getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+      .addImm(AMDGPU::encodeWaitcnt(IV, Wait));
+  --MI;
+  return true;
+}
+
+/// Inserts an S_WAITCNT right after the atomic instruction \p MI that waits
+/// for both VmCnt and LgkmCnt to reach zero.
+///
+/// \p ret is not consulted here. On return \p MI points at the inserted wait.
+/// \returns true, since an instruction is always inserted.
+bool SIGfx9PreciseMemorySupport::handleAtomic(MachineBasicBlock::iterator &MI,
+                                              bool ret) {
+  assert(MI->mayLoadOrStore());
+
+  AMDGPU::Waitcnt BothCounters;
+  BothCounters.LoadCnt = 0; // VmCnt
+  BothCounters.DsCnt = 0;   // LgkmCnt
+
+  MachineBasicBlock &Block = *MI->getParent();
+  const unsigned Imm = AMDGPU::encodeWaitcnt(IV, BothCounters);
+
+  // Insert after MI, then leave MI pointing at the new instruction.
+  ++MI;
+  BuildMI(Block, MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Imm);
+  --MI;
+  return true;
+}
+
+/// Inserts the wait instruction(s) needed right after the non-atomic memory
+/// instruction \p MI.
+///
+/// Depending on the kind of instruction this is an S_WAITCNT, an
+/// S_WAITCNT_VSCNT, or an S_WAITCNT followed by an S_WAITCNT_VSCNT:
+///   - scalar (SMRD) load: S_WAITCNT on LgkmCnt; scalar stores get nothing
+///   - VMEM load:  S_WAITCNT on VmCnt
+///   - VMEM store: S_WAITCNT_VSCNT (no S_WAITCNT unless it also loads)
+///   - FLAT load:  S_WAITCNT on VmCnt and LgkmCnt
+///   - FLAT store: S_WAITCNT on LgkmCnt, plus S_WAITCNT_VSCNT
+///   - anything else (LDS): S_WAITCNT on LgkmCnt
+/// An instruction may be both a load and a store; the requirements of both
+/// are then combined.
+///
+/// When something is inserted, \p MI points at the last inserted instruction
+/// on return.
+/// \returns true if anything was inserted, false otherwise.
+bool SIGfx10And11PreciseMemorySupport::handleNonAtomic(
+    MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &MemInst = *MI;
+  const bool Loads = MemInst.mayLoad();
+  const bool Stores = MemInst.mayStore();
+
+  AMDGPU::Waitcnt Wait;
+  bool NeedWaitCnt = true;
+  bool NeedVsCnt = false;
+
+  if (TII->isSMRD(MemInst)) { // scalar
+    if (Stores)
+      return false;
+    Wait.DsCnt = 0; // LgkmCnt
+  } else if (TII->isVMEM(MemInst)) {
+    if (Loads)
+      Wait.LoadCnt = 0; // VmCnt
+    if (Stores) {
+      // A pure VMEM store is covered by S_WAITCNT_VSCNT alone.
+      NeedWaitCnt = Loads;
+      NeedVsCnt = true;
+    }
+  } else if (TII->isFLAT(MemInst)) {
+    if (Loads) {
+      Wait.LoadCnt = 0; // VmCnt
+      Wait.DsCnt = 0;   // LgkmCnt
+    }
+    if (Stores) {
+      Wait.DsCnt = 0; // LgkmCnt
+      NeedVsCnt = true;
+    }
+  } else if (Loads || Stores) { // LDS
+    Wait.DsCnt = 0; // LgkmCnt
+  }
+
+  MachineBasicBlock &Block = *MemInst.getParent();
+
+  // Each insertion goes right after the instruction MI currently refers to,
+  // and MI is then moved onto the newly inserted instruction.
+  if (NeedWaitCnt) {
+    const unsigned Imm = AMDGPU::encodeWaitcnt(IV, Wait);
+    ++MI;
+    BuildMI(Block, MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Imm);
+    --MI;
+  }
+
+  if (NeedVsCnt) {
+    ++MI;
+    BuildMI(Block, MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+        .addImm(0);
+    --MI;
+  }
+  return true;
+}
+
+bool SIGfx10And11PreciseMemorySupport ::handleAtomic(
+ MachineBasicBlock::iterator &MI, bool ret) {
----------------
jwanggit86 wrote:
Like the existing functions, the return value indicates whether the basic block was changed, e.g., by inserting a new instruction.
https://github.com/llvm/llvm-project/pull/79236
More information about the cfe-commits
mailing list