[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
Jay Foad via cfe-commits
cfe-commits at lists.llvm.org
Tue Feb 27 00:26:34 PST 2024
================
@@ -2378,6 +2409,215 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
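+
+// Insert an S_WAITCNT after a non-atomic memory instruction so that, in
+// precise-memory mode, the access has completed before the next instruction
+// issues. Returns true if a wait was inserted.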
+bool SIGfx6CacheControl::handleNonAtomicForPreciseMemory(
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr &Inst = *MI;
+ AMDGPU::Waitcnt Wait;
+
+ if (TII->isSMRD(Inst)) { // scalar
+ if (Inst.mayStore())
+ return false;
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // vector
+ if (Inst.mayLoad()) { // vector load
+ if (TII->isVMEM(Inst)) // VMEM load
+ Wait.LoadCnt = 0; // VmCnt
+ else if (TII->isFLAT(Inst)) { // Flat load
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+ } else // LDS load
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // vector store
+ if (TII->isVMEM(Inst)) // VMEM store
+ Wait.LoadCnt = 0; // VmCnt
+ else if (TII->isFLAT(Inst)) { // Flat store
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+      } else // LDS store
+        Wait.DsCnt = 0; // LgkmCnt
+ }
+ }
+
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ MachineBasicBlock &MBB = *MI->getParent();
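+  // Step past MI so the wait is inserted immediately after it, then restore
+  // the caller's iterator.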
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ return true;
+}
+
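+// Insert an S_WAITCNT draining both VmCnt and LgkmCnt after an atomic.
+// Pre-gfx10 there is no separate store counter, so the returning and
+// non-returning cases are handled identically.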
+bool SIGfx6CacheControl::handleAtomicForPreciseMemory(
+ MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) {
+ assert(MI->mayLoadOrStore());
+
+ AMDGPU::Waitcnt Wait;
+
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ MachineBasicBlock &MBB = *MI->getParent();
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ return true;
+}
+
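+// gfx10: VMEM stores are tracked by the separate VsCnt counter, so a vector
+// store may need S_WAITCNT_VSCNT instead of, or in addition to, S_WAITCNT.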
+bool SIGfx10CacheControl::handleNonAtomicForPreciseMemory(
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr &Inst = *MI;
+ AMDGPU::Waitcnt Wait;
+
+ bool BuildWaitCnt = true;
+ bool BuildVsCnt = false;
+
+ if (TII->isSMRD(Inst)) { // scalar
+ if (Inst.mayStore())
+ return false;
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // vector
+ if (Inst.mayLoad()) { // vector load
+ if (TII->isVMEM(Inst)) // VMEM load
+ Wait.LoadCnt = 0; // VmCnt
+ else if (TII->isFLAT(Inst)) { // Flat load
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+ } else // LDS load
+ Wait.DsCnt = 0; // LgkmCnt
+ }
+
+    // For some vector instructions, mayLoad() and mayStore() can both be
+    // true, so the store handling below may add to the waits chosen above.
+    if (Inst.mayStore()) { // vector store (possibly also a load)
+ if (TII->isVMEM(Inst)) { // VMEM store
+ if (!Inst.mayLoad())
+ BuildWaitCnt = false;
+ BuildVsCnt = true;
+ } else if (TII->isFLAT(Inst)) { // Flat store
+ Wait.DsCnt = 0; // LgkmCnt
+ BuildVsCnt = true;
+      } else // LDS store
+        Wait.DsCnt = 0; // LgkmCnt
+ }
+ }
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ if (BuildWaitCnt) {
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ }
+
+ if (BuildVsCnt) {
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+ --MI;
+ }
+ return true;
+}
+
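+// gfx10: a returning atomic completes through VmCnt/LgkmCnt, while a
+// non-returning atomic counts as a store and needs S_WAITCNT_VSCNT as well.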
+bool SIGfx10CacheControl::handleAtomicForPreciseMemory(
+ MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) {
+ assert(MI->mayLoadOrStore());
+
+ AMDGPU::Waitcnt Wait;
+
+ Wait.DsCnt = 0; // LgkmCnt
+ if (IsAtomicWithRet)
+ Wait.LoadCnt = 0; // VmCnt
+
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ MachineBasicBlock &MBB = *MI->getParent();
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ if (!IsAtomicWithRet) {
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+ --MI;
+ }
+ return true;
+}
+
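+// gfx12: the combined waitcnt is split into per-class S_WAIT_* instructions
+// (KMCNT, LOADCNT, STORECNT, DSCNT, ...), so pick the wait opcode matching
+// the instruction's memory class.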
+bool SIGfx12CacheControl::handleNonAtomicForPreciseMemory(
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr &Inst = *MI;
+ unsigned WaitType = 0;
+  // For some vector instructions, mayLoad() and mayStore() can both be true.
+ bool LoadAndStore = false;
+
+ if (TII->isSMRD(Inst)) { // scalar
+ if (Inst.mayStore())
+ return false;
+
+ WaitType = AMDGPU::S_WAIT_KMCNT;
+ } else { // vector
+ if (Inst.mayLoad() && Inst.mayStore()) {
+ WaitType = AMDGPU::S_WAIT_LOADCNT;
+ LoadAndStore = true;
+ } else if (Inst.mayLoad()) { // vector load
+ if (TII->isVMEM(Inst)) // VMEM load
+ WaitType = AMDGPU::S_WAIT_LOADCNT;
----------------
jayfoad wrote:
What about SAMPLECNT and BVHCNT?
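
On gfx12 an image sample load decrements SAMPLECNT and a BVH intersection
load decrements BVHCNT rather than LOADCNT, so S_WAIT_LOADCNT alone would
not cover them. A minimal sketch of the extra dispatch, assuming the MIMG
base-opcode query from AMDGPUBaseInfo that SIInsertWaitcnts uses to
classify VMEM accesses (not code from this patch):

  if (TII->isMIMG(Inst)) {
    // Classify the image instruction by its base opcode.
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
    const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
        AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
    if (BaseInfo->BVH)
      WaitType = AMDGPU::S_WAIT_BVHCNT;    // BVH intersection loads
    else if (BaseInfo->Sampler)
      WaitType = AMDGPU::S_WAIT_SAMPLECNT; // image sample loads
    else
      WaitType = AMDGPU::S_WAIT_LOADCNT;   // plain image loads
  }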
https://github.com/llvm/llvm-project/pull/79236