[llvm] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (PR #77438)

Fri Jan 12 04:51:41 PST 2024

================
@@ -354,20 +411,139 @@ class WaitcntBrackets {
   int VgprUB = -1;
   int SgprUB = -1;
   unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
-  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
+  // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
+  // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
   unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
   // write to each vgpr.
   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
 };
 
+// This abstracts the logic for generating and updating S_WAIT* instructions
+// away from the analysis that determines where they are needed. This was
+// done because the set of counters and instructions for waiting on them
+// underwent a major shift with gfx12, sufficiently so that having this
+// abstraction allows the main analysis logic to be simpler than it would
+// otherwise have had to become.
+class WaitcntGenerator {
+protected:
+  const GCNSubtarget *ST = nullptr;
+  const SIInstrInfo *TII = nullptr;
+  AMDGPU::IsaVersion IV;
+  InstCounterType MaxCounter;
+
+public:
+  WaitcntGenerator() {}
+  WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
+      : ST(ST), TII(ST->getInstrInfo()),
+        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
+
+  // Edits an existing sequence of wait count instructions according
+  // to an incoming Waitcnt value, which is itself updated to reflect
+  // any new wait count instructions which may need to be generated by
+  // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
+  // were made.
+  //
+  // This editing will usually be merely updated operands, but it may also
+  // delete instructions if the incoming Wait value indicates they are not
+  // needed. It may also remove existing instructions for which a wait
+  // is needed if it can be determined that it is better to generate new
+  // instructions later, as can happen on gfx12.
+  virtual bool
+  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+                          MachineBasicBlock::instr_iterator It) const = 0;
+
+  // Transform a soft waitcnt into a normal one.
+  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
+
+  // Generates new wait count instructions according to the  value of
+  // Wait, returning true if any new instructions were created.
+  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
+                                MachineBasicBlock::instr_iterator It,
+                                AMDGPU::Waitcnt Wait) = 0;
+
+  // Returns an array of bit masks which can be used to map values in
+  // WaitEventType to corresponding counter values in InstCounterType.
+  virtual const unsigned *getWaitEventMask() const = 0;
+
+  virtual ~WaitcntGenerator() = default;
+};
+
+class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
+public:
+  WaitcntGeneratorPreGFX12() {}
+  WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
+      : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
+
+  bool
+  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+                          MachineBasicBlock::instr_iterator It) const override;
+
+  bool createNewWaitcnt(MachineBasicBlock &Block,
+                        MachineBasicBlock::instr_iterator It,
+                        AMDGPU::Waitcnt Wait) override;
+
+  const unsigned *getWaitEventMask() const override {
+    assert(ST);
+
+    static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
----------------
Pierre-vh wrote:

Maybe out of scope for this patch, but could we rewrite this array to make it clearer? I find it hard to read in its current state.
Maybe have some constexpr helper so each entry can be something like `getEvenMask(VMEM_ACCESS, VMEM_READ_ACCESS, ...)` instead of a series of shift/ors that make it hard to see the `,` separating the entries?

If you don't want to do it that's fine, I might do it later myself then.

https://github.com/llvm/llvm-project/pull/77438