[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
Pierre van Houtryve via cfe-commits
cfe-commits at lists.llvm.org
Mon Feb 5 23:51:09 PST 2024
================
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
                                       bool IsNonTemporal) const override;
 };
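+/// Inserts waitcnt instructions after each memory instruction so that the
+/// memory operation has completed before execution continues ("precise
+/// memory" support).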
+class SIPreciseMemorySupport {
+protected:
+  const GCNSubtarget &ST;
+  const SIInstrInfo *TII = nullptr;
+
+  IsaVersion IV;
+
+  SIPreciseMemorySupport(const GCNSubtarget &ST) : ST(ST) {
+    TII = ST.getInstrInfo();
+    IV = getIsaVersion(ST.getCPU());
+  }
+
+public:
+  static std::unique_ptr<SIPreciseMemorySupport> create(const GCNSubtarget &ST);
+
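+  /// Handles non-atomic load or store instruction \p MI.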
+  virtual bool handleNonAtomic(MachineBasicBlock::iterator &MI) = 0;
+  /// Handles atomic instruction \p MI with \p ret indicating whether \p MI
+  /// returns a result.
+  virtual bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) = 0;
+};
+
+class SIGfx9PreciseMemorySupport : public SIPreciseMemorySupport {
+public:
+  SIGfx9PreciseMemorySupport(const GCNSubtarget &ST)
+      : SIPreciseMemorySupport(ST) {}
+  bool handleNonAtomic(MachineBasicBlock::iterator &MI) override;
+  bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override;
+};
+
+class SIGfx10And11PreciseMemorySupport : public SIPreciseMemorySupport {
+public:
+  SIGfx10And11PreciseMemorySupport(const GCNSubtarget &ST)
+      : SIPreciseMemorySupport(ST) {}
+  bool handleNonAtomic(MachineBasicBlock::iterator &MI) override;
+  bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override;
+};
+
+std::unique_ptr<SIPreciseMemorySupport>
+SIPreciseMemorySupport::create(const GCNSubtarget &ST) {
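+  // GFX10 introduced a separate VSCNT counter for vector stores; the
+  // GFX10/11 variant accounts for it by emitting S_WAITCNT_VSCNT.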
+  GCNSubtarget::Generation Generation = ST.getGeneration();
+  if (Generation < AMDGPUSubtarget::GFX10)
+    return std::make_unique<SIGfx9PreciseMemorySupport>(ST);
+  return std::make_unique<SIGfx10And11PreciseMemorySupport>(ST);
+}
+
+bool SIGfx9PreciseMemorySupport::handleNonAtomic(
+    MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  AMDGPU::Waitcnt Wait;
+
+  if (TII->isSMRD(Inst)) { // scalar
+    if (Inst.mayStore())
+      return false;
+    Wait.DsCnt = 0; // LgkmCnt
+  } else { // vector
+    if (Inst.mayLoad()) { // vector load
+      if (TII->isVMEM(Inst)) { // VMEM load
+        Wait.LoadCnt = 0; // VmCnt
+      } else if (TII->isFLAT(Inst)) { // Flat load
+        Wait.LoadCnt = 0; // VmCnt
+        Wait.DsCnt = 0; // LgkmCnt
+      } else { // LDS load
+        Wait.DsCnt = 0; // LgkmCnt
+      }
+    } else { // vector store
+      if (TII->isVMEM(Inst)) { // VMEM store
+        Wait.LoadCnt = 0; // VmCnt
+      } else if (TII->isFLAT(Inst)) { // Flat store
+        Wait.LoadCnt = 0; // VmCnt
+        Wait.DsCnt = 0; // LgkmCnt
+      } else {
+        Wait.DsCnt = 0; // LDS store; LgkmCnt
+      }
+    }
+  }
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
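+  // BuildMI with ++MI inserts the S_WAITCNT immediately after the memory
+  // instruction; --MI then moves the iterator back onto the new S_WAITCNT.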
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  return true;
+}
+
+bool SIGfx9PreciseMemorySupport::handleAtomic(MachineBasicBlock::iterator &MI,
+                                              bool ret) {
+  assert(MI->mayLoadOrStore());
+
+  AMDGPU::Waitcnt Wait;
+
+  Wait.LoadCnt = 0; // VmCnt
+  Wait.DsCnt = 0; // LgkmCnt
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  return true;
+}
+
+bool SIGfx10And11PreciseMemorySupport::handleNonAtomic(
+    MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  AMDGPU::Waitcnt Wait;
+
+  bool BuildWaitCnt = true;
+  bool BuildVsCnt = false;
+
+  if (TII->isSMRD(Inst)) { // scalar
+    if (Inst.mayStore())
+      return false;
+    Wait.DsCnt = 0; // LgkmCnt
+  } else { // vector
+    if (Inst.mayLoad()) { // vector load
+      if (TII->isVMEM(Inst)) { // VMEM load
+        Wait.LoadCnt = 0; // VmCnt
+      } else if (TII->isFLAT(Inst)) { // Flat load
+        Wait.LoadCnt = 0; // VmCnt
+        Wait.DsCnt = 0; // LgkmCnt
+      } else { // LDS load
+        Wait.DsCnt = 0; // LgkmCnt
+      }
+    }
+
+    // For some instructions, mayLoad() and mayStore() can both be true.
+    if (Inst.mayStore()) { // vector store
+      if (TII->isVMEM(Inst)) { // VMEM store
+        if (!Inst.mayLoad())
+          BuildWaitCnt = false;
+        BuildVsCnt = true;
+      } else if (TII->isFLAT(Inst)) { // Flat store
+        Wait.DsCnt = 0; // LgkmCnt
+        BuildVsCnt = true;
+      } else {
+        Wait.DsCnt = 0; // LDS store; LgkmCnt
+      }
+    }
+  }
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  if (BuildWaitCnt) {
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+    BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+    --MI;
+  }
+
+  if (BuildVsCnt) {
+    BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+        .addImm(0);
+    --MI;
+  }
+  return true;
+}
+
+bool SIGfx10And11PreciseMemorySupport::handleAtomic(
+    MachineBasicBlock::iterator &MI, bool ret) {
----------------
Pierre-vh wrote:
Use something more descriptive than `ret` + use CamelCase, e.g. `IsReturningAtomic`
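Something like this, as a rough sketch:

    /// Handles atomic instruction \p MI, with \p IsReturningAtomic indicating
    /// whether \p MI returns a result.
    virtual bool handleAtomic(MachineBasicBlock::iterator &MI,
                              bool IsReturningAtomic) = 0;

(with the same rename applied to the overrides and definitions).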
https://github.com/llvm/llvm-project/pull/79236