[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
Jay Foad via cfe-commits
cfe-commits at lists.llvm.org
Tue Feb 27 00:26:34 PST 2024
================
@@ -2378,6 +2409,215 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
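+
+// Insert an S_WAITCNT after a non-atomic memory instruction so that, in
+// precise-memory mode, the access has completed before the next instruction
+// issues. Returns true if a wait was inserted.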
+bool SIGfx6CacheControl::handleNonAtomicForPreciseMemory(
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr &Inst = *MI;
+ AMDGPU::Waitcnt Wait;
+
+ if (TII->isSMRD(Inst)) { // scalar
+ if (Inst.mayStore())
+ return false;
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // vector
+ if (Inst.mayLoad()) { // vector load
+ if (TII->isVMEM(Inst)) // VMEM load
+ Wait.LoadCnt = 0; // VmCnt
+ else if (TII->isFLAT(Inst)) { // Flat load
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+ } else // LDS load
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // vector store
+ if (TII->isVMEM(Inst)) // VMEM store
+ Wait.LoadCnt = 0; // VmCnt
+ else if (TII->isFLAT(Inst)) { // Flat store
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+      } else // LDS store
+        Wait.DsCnt = 0; // LgkmCnt
+ }
+ }
+
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ MachineBasicBlock &MBB = *MI->getParent();
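+  // Step past MI so the wait is inserted immediately after it, then restore
+  // the caller's iterator.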
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ return true;
+}
+
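+// Insert an S_WAITCNT draining both VmCnt and LgkmCnt after an atomic.
+// Pre-gfx10 there is no separate store counter, so the returning and
+// non-returning cases are handled identically.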
+bool SIGfx6CacheControl::handleAtomicForPreciseMemory(
+ MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) {
+ assert(MI->mayLoadOrStore());
+
+ AMDGPU::Waitcnt Wait;
+
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ MachineBasicBlock &MBB = *MI->getParent();
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ return true;
+}
+
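+// gfx10: VMEM stores are tracked by the separate VsCnt counter, so a vector
+// store may need S_WAITCNT_VSCNT instead of, or in addition to, S_WAITCNT.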
+bool SIGfx10CacheControl::handleNonAtomicForPreciseMemory(
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr &Inst = *MI;
+ AMDGPU::Waitcnt Wait;
+
+ bool BuildWaitCnt = true;
+ bool BuildVsCnt = false;
+
+ if (TII->isSMRD(Inst)) { // scalar
+ if (Inst.mayStore())
+ return false;
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // vector
+ if (Inst.mayLoad()) { // vector load
+ if (TII->isVMEM(Inst)) // VMEM load
+ Wait.LoadCnt = 0; // VmCnt
+ else if (TII->isFLAT(Inst)) { // Flat load
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+ } else // LDS load
+ Wait.DsCnt = 0; // LgkmCnt
+ }
+
+    // For some vector instructions, mayLoad() and mayStore() can both be
+    // true, so the store handling below may add to the waits chosen above.
+    if (Inst.mayStore()) { // vector store (possibly also a load)
+ if (TII->isVMEM(Inst)) { // VMEM store
+ if (!Inst.mayLoad())
+ BuildWaitCnt = false;
+ BuildVsCnt = true;
+ } else if (TII->isFLAT(Inst)) { // Flat store
+ Wait.DsCnt = 0; // LgkmCnt
+ BuildVsCnt = true;
+      } else // LDS store
+        Wait.DsCnt = 0; // LgkmCnt
+ }
+ }
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ if (BuildWaitCnt) {
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ }
+
+ if (BuildVsCnt) {
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+ --MI;
+ }
+ return true;
+}
+
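+// gfx10: a returning atomic completes through VmCnt/LgkmCnt, while a
+// non-returning atomic counts as a store and needs S_WAITCNT_VSCNT as well.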
+bool SIGfx10CacheControl::handleAtomicForPreciseMemory(
+ MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) {
+ assert(MI->mayLoadOrStore());
+
+ AMDGPU::Waitcnt Wait;
+
+ Wait.DsCnt = 0; // LgkmCnt
+ if (IsAtomicWithRet)
+ Wait.LoadCnt = 0; // VmCnt
+
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ MachineBasicBlock &MBB = *MI->getParent();
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ if (!IsAtomicWithRet) {
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+ --MI;
+ }
+ return true;
+}
+
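+// gfx12: the combined waitcnt is split into per-class S_WAIT_* instructions
+// (KMCNT, LOADCNT, STORECNT, DSCNT, ...), so pick the wait opcode matching
+// the instruction's memory class.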
+bool SIGfx12CacheControl::handleNonAtomicForPreciseMemory(
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr &Inst = *MI;
+ unsigned WaitType = 0;
+  // For some vector instructions, mayLoad() and mayStore() can both be true.
+ bool LoadAndStore = false;
+
+ if (TII->isSMRD(Inst)) { // scalar
+ if (Inst.mayStore())
+ return false;
+
+ WaitType = AMDGPU::S_WAIT_KMCNT;
+ } else { // vector
+ if (Inst.mayLoad() && Inst.mayStore()) {
+ WaitType = AMDGPU::S_WAIT_LOADCNT;
+ LoadAndStore = true;
+ } else if (Inst.mayLoad()) { // vector load
+ if (TII->isVMEM(Inst)) // VMEM load
+ WaitType = AMDGPU::S_WAIT_LOADCNT;
----------------
jayfoad wrote:
What about SAMPLECNT and BVHCNT?
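
On gfx12 an image sample load decrements SAMPLECNT and a BVH intersection
load decrements BVHCNT rather than LOADCNT, so S_WAIT_LOADCNT alone would
not cover them. A minimal sketch of the extra dispatch, assuming the MIMG
base-opcode query from AMDGPUBaseInfo that SIInsertWaitcnts uses to
classify VMEM accesses (not code from this patch):

  if (TII->isMIMG(Inst)) {
    // Classify the image instruction by its base opcode.
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
    const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
        AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
    if (BaseInfo->BVH)
      WaitType = AMDGPU::S_WAIT_BVHCNT;    // BVH intersection loads
    else if (BaseInfo->Sampler)
      WaitType = AMDGPU::S_WAIT_SAMPLECNT; // image sample loads
    else
      WaitType = AMDGPU::S_WAIT_LOADCNT;   // plain image loads
  }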
https://github.com/llvm/llvm-project/pull/79236