[llvm] [AMDGPU] Rewrite GFX12 SGPR hazard handling to dedicated pass (PR #118750)

Wed Dec 4 23:07:24 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-globalisel

Author: Carl Ritson (perlfu)

<details>
<summary>Changes</summary>

- Algorithm operates over whole IR to attempt to minimize waits.
- Add support for VALU->VALU SGPR hazards via VA_SDST/VA_VCC.

---

Patch is 871.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118750.diff


87 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+3) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+3) 
- (added) llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp (+487) 
- (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1) 
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (-274) 
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h (-4) 
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+36) 
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+18) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (-8) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll (-8) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll (+6) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll (+1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll (+12) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+32-15) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+11-19) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+40-44) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+9-16) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+9-16) 
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+12-40) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+23-55) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+23-55) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+41-36) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+63-62) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+63-62) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+61-76) 
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+24-97) 
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum3.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/fminimum3.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+27-32) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+47-57) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+47-57) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+47-71) 
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+16-2) 
- (modified) llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll (+5-10) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+1-7) 
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+34-11) 
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+5) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll (-6) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll (+8-16) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll (+2-3) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+52-12) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll (+16) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll (+3-6) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll (+1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll (+6-12) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll (+6-12) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll (+6-12) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll (+4-9) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll (+4-9) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.mulo.ll (+22-7) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+3-10) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (-1) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (-1) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+12-22) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+20-26) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+20-26) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+20-36) 
- (modified) llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll (-2) 
- (modified) llvm/test/CodeGen/AMDGPU/mad_64_32.ll (+17-7) 
- (modified) llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll (-6) 
- (modified) llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll (-3) 
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.ll (+18-8) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll (+2-5) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll (+8-16) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (+8-16) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll (-2) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll (-4) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (-4) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll (+2-6) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll (+2-6) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll (-1) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll (-4) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (-4) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll (+1) 
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.ll (+18-8) 
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-flat.ll (+62-6) 
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-global.ll (+62-6) 
- (modified) llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll (+22-11) 
- (modified) llvm/test/CodeGen/AMDGPU/s-barrier.ll (+12-32) 
- (modified) llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll (-1) 
- (modified) llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll (+82-12) 
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+4) 
- (modified) llvm/test/CodeGen/AMDGPU/v_swap_b16.ll (-2) 
- (modified) llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir (+724-532) 
- (modified) llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir (+2-4) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b9769a1baf4d17..b922d2083c0b8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -459,6 +459,9 @@ void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
 void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &);
 extern char &GCNRewritePartialRegUsesID;
 
+void initializeAMDGPUWaitSGPRHazardsPass(PassRegistry &);
+extern char &AMDGPUWaitSGPRHazardsID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6e2eb254ff60c6..a88f724033e5b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -535,6 +535,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeGCNPreRALongBranchRegPass(*PR);
   initializeGCNRewritePartialRegUsesPass(*PR);
   initializeGCNRegPressurePrinterPass(*PR);
+  initializeAMDGPUWaitSGPRHazardsPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1629,6 +1630,8 @@ void GCNPassConfig::addPreEmitPass() {
   // cases.
   addPass(&PostRAHazardRecognizerID);
 
+  addPass(&AMDGPUWaitSGPRHazardsID);
+
   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
     addPass(&AMDGPUInsertDelayAluID);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
new file mode 100644
index 00000000000000..764953960a0d51
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
@@ -0,0 +1,487 @@
+//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SetVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"
+
+static cl::opt<bool> GlobalEnableSGPRHazardWaits(
+    "amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden,
+    cl::desc("Enable required s_wait_alu on SGPR hazards"));
+
+static cl::opt<bool> GlobalCullSGPRHazardsOnFunctionBoundary(
+    "amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden,
+    cl::desc("Cull hazards on function boundaries"));
+
+static cl::opt<bool>
+    GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull",
+                                   cl::init(false), cl::Hidden,
+                                   cl::desc("Cull hazards on memory waits"));
+
+static cl::opt<unsigned> GlobalCullSGPRHazardsMemWaitThreshold(
+    "amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden,
+    cl::desc("Number of tracked SGPRs before initiating hazard cull on memory "
+             "wait"));
+
+namespace {
+
+class AMDGPUWaitSGPRHazards : public MachineFunctionPass {
+public:
+  static char ID;
+
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+  bool Wave64;
+
+  bool EnableSGPRHazardWaits;
+  bool CullSGPRHazardsOnFunctionBoundary;
+  bool CullSGPRHazardsAtMemWait;
+  unsigned CullSGPRHazardsMemWaitThreshold;
+
+  AMDGPUWaitSGPRHazards() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  // Return the numeric ID 0-127 for a given SGPR.
+  static std::optional<unsigned> sgprNumber(Register Reg,
+                                            const SIRegisterInfo &TRI) {
+    switch (Reg) {
+    case AMDGPU::M0:
+    case AMDGPU::EXEC:
+    case AMDGPU::EXEC_LO:
+    case AMDGPU::EXEC_HI:
+    case AMDGPU::SGPR_NULL:
+    case AMDGPU::SGPR_NULL64:
+      return {};
+    default:
+      break;
+    }
+    unsigned RegN = TRI.getEncodingValue(Reg);
+    if (RegN > 127)
+      return {};
+    return RegN;
+  }
+
+  static inline bool IsVCC(Register Reg) {
+    return (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO ||
+            Reg == AMDGPU::VCC_HI);
+  }
+
+  // Adjust global offsets for instructions bundled with S_GETPC_B64 after
+  // insertion of a new instruction.
+  static void updateGetPCBundle(MachineInstr *NewMI) {
+    if (!NewMI->isBundled())
+      return;
+
+    // Find start of bundle.
+    auto I = NewMI->getIterator();
+    while (I->isBundledWithPred())
+      I--;
+    if (I->isBundle())
+      I++;
+
+    // Bail if this is not an S_GETPC bundle.
+    if (I->getOpcode() != AMDGPU::S_GETPC_B64)
+      return;
+
+    // Update offsets of any references in the bundle.
+    const unsigned NewBytes = 4;
+    assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+           "Unexpected instruction insertion in bundle");
+    auto NextMI = std::next(NewMI->getIterator());
+    auto End = NewMI->getParent()->end();
+    while (NextMI != End && NextMI->isBundledWithPred()) {
+      for (auto &Operand : NextMI->operands()) {
+        if (Operand.isGlobal())
+          Operand.setOffset(Operand.getOffset() + NewBytes);
+      }
+      NextMI++;
+    }
+  }
+
+  struct HazardState {
+    static constexpr unsigned None = 0;
+    static constexpr unsigned SALU = (1 << 0);
+    static constexpr unsigned VALU = (1 << 1);
+
+    std::bitset<64> Tracked;      // SGPR banks ever read by VALU
+    std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU
+    std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU
+    unsigned VCCHazard = None;    // Source of current VCC writes
+    bool ActiveFlat = false;      // Has unwaited flat instructions
+
+    bool merge(const HazardState &RHS) {
+      HazardState Orig(*this);
+
+      Tracked |= RHS.Tracked;
+      SALUHazards |= RHS.SALUHazards;
+      VALUHazards |= RHS.VALUHazards;
+      VCCHazard |= RHS.VCCHazard;
+      ActiveFlat |= RHS.ActiveFlat;
+
+      return (*this != Orig);
+    }
+
+    bool operator==(const HazardState &RHS) const {
+      return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards &&
+             VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard &&
+             ActiveFlat == RHS.ActiveFlat;
+    }
+    bool operator!=(const HazardState &RHS) const { return !(*this == RHS); }
+  };
+
+  struct BlockHazardState {
+    HazardState In;
+    HazardState Out;
+  };
+
+  DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;
+
+  static constexpr unsigned WAVE32_NOPS = 4;
+  static constexpr unsigned WAVE64_NOPS = 8;
+
+  void insertHazardCull(MachineBasicBlock &MBB,
+                        MachineBasicBlock::instr_iterator &MI) {
+    assert(!MI->isBundled());
+    unsigned Count = Wave64 ? WAVE64_NOPS : WAVE32_NOPS;
+    while (Count--)
+      BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
+  }
+
+  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
+    enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };
+
+    HazardState State = BlockState[&MBB].In;
+    SmallSet<Register, 8> SeenRegs;
+    bool Emitted = false;
+    unsigned DsNops = 0;
+
+    for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(),
+                                           E = MBB.instr_end();
+         MI != E; ++MI) {
+      // Clear tracked SGPRs if sufficient DS_NOPs occur
+      if (MI->getOpcode() == AMDGPU::DS_NOP) {
+        if (++DsNops >= (Wave64 ? WAVE64_NOPS : WAVE32_NOPS))
+          State.Tracked.reset();
+        continue;
+      }
+      DsNops = 0;
+
+      // Snoop FLAT instructions to avoid adding culls before scratch/lds loads.
+      // Culls could be disproportionate in cost to load time.
+      if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI))
+        State.ActiveFlat = true;
+
+      // SMEM or VMEM clears hazards
+      if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSMRD(*MI)) {
+        State.VCCHazard = HazardState::None;
+        State.SALUHazards.reset();
+        State.VALUHazards.reset();
+        continue;
+      }
+
+      // Existing S_WAITALU can clear hazards
+      if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
+        unsigned int Mask = MI->getOperand(0).getImm();
+        if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0)
+          State.VCCHazard &= ~HazardState::VALU;
+        if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) {
+          State.SALUHazards.reset();
+          State.VCCHazard &= ~HazardState::SALU;
+        }
+        if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0)
+          State.VALUHazards.reset();
+        continue;
+      }
+
+      // Snoop counter waits to insert culls
+      if (CullSGPRHazardsAtMemWait &&
+          (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
+           MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
+           MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
+          (MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) &&
+          (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
+        if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
+          State.ActiveFlat = false;
+        } else {
+          State.Tracked.reset();
+          if (Emit)
+            insertHazardCull(MBB, MI);
+          continue;
+        }
+      }
+
+      // Process only VALUs and SALUs
+      bool IsVALU = SIInstrInfo::isVALU(*MI);
+      bool IsSALU = SIInstrInfo::isSALU(*MI);
+      if (!IsVALU && !IsSALU)
+        continue;
+
+      unsigned Wait = 0;
+
+      auto processOperand = [&](const MachineOperand &Op, bool IsUse) {
+        if (!Op.isReg())
+          return;
+        Register Reg = Op.getReg();
+        assert(!Op.getSubReg());
+        // Only consider implicit operands of VCC.
+        if (Op.isImplicit() && !IsVCC(Reg))
+          return;
+        if (!TRI->isSGPRReg(*MRI, Reg))
+          return;
+
+        // Only visit each register once
+        if (!SeenRegs.insert(Reg).second)
+          return;
+
+        auto RegNumber = sgprNumber(Reg, *TRI);
+        if (!RegNumber)
+          return;
+
+        // Track SGPRs by pair -- numeric ID of a 64b SGPR pair.
+        // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc
+        unsigned RegN = *RegNumber;
+        unsigned PairN = (RegN >> 1) & 0x3f;
+
+        // Read/write of untracked register is safe; but must record any new
+        // reads.
+        if (!State.Tracked[PairN]) {
+          if (IsVALU && IsUse)
+            State.Tracked.set(PairN);
+          return;
+        }
+
+        uint8_t SGPRCount =
+            AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32;
+
+        if (IsUse) {
+          // SALU reading SGPR clears VALU hazards
+          if (IsSALU) {
+            if (IsVCC(Reg)) {
+              if (State.VCCHazard & HazardState::VALU)
+                State.VCCHazard = HazardState::None;
+            } else {
+              State.VALUHazards.reset();
+            }
+          }
+          // Compute required waits
+          for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
+            Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
+            Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
+          }
+          if (IsVCC(Reg) && State.VCCHazard) {
+            // Note: it's possible for both SALU and VALU to exist if VCC
+            // was updated differently by merged predecessors.
+            if (State.VCCHazard & HazardState::SALU)
+              Wait |= WA_SALU;
+            if (State.VCCHazard & HazardState::VALU)
+              Wait |= WA_VCC;
+          }
+        } else {
+          // Update hazards
+          if (IsVCC(Reg)) {
+            State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
+          } else {
+            for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
+              if (IsSALU)
+                State.SALUHazards.set(RegN + RegIdx);
+              else
+                State.VALUHazards.set(RegN + RegIdx);
+            }
+          }
+        }
+      };
+
+      const bool IsSetPC = (MI->isCall() || MI->isReturn() ||
+                            MI->getOpcode() == AMDGPU::S_SETPC_B64) &&
+                           !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
+                             MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
+
+      if (IsSetPC) {
+        // All SGPR writes before a call/return must be flushed as the
+        // callee/caller will not see the hazard chain.
+        if (State.VCCHazard & HazardState::VALU)
+          Wait |= WA_VCC;
+        if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
+          Wait |= WA_SALU;
+        if (State.VALUHazards.any())
+          Wait |= WA_VALU;
+        if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
+          State.Tracked.reset();
+          if (Emit)
+            insertHazardCull(MBB, MI);
+        }
+      } else {
+        // Process uses to determine required wait.
+        SeenRegs.clear();
+        for (const MachineOperand &Op : MI->all_uses())
+          processOperand(Op, true);
+      }
+
+      // Apply wait
+      if (Wait) {
+        unsigned Mask = 0xffff;
+        if (Wait & WA_VCC) {
+          State.VCCHazard &= ~HazardState::VALU;
+          Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
+        }
+        if (Wait & WA_SALU) {
+          State.SALUHazards.reset();
+          State.VCCHazard &= ~HazardState::SALU;
+          Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0);
+        }
+        if (Wait & WA_VALU) {
+          State.VALUHazards.reset();
+          Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0);
+        }
+        if (Emit) {
+          auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
+                               TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+                           .addImm(Mask);
+          updateGetPCBundle(NewMI);
+          Emitted = true;
+        }
+      }
+
+      // After a call the SGPR state is unknown, so all are potential hazards.
+      if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
+        State.Tracked.set();
+
+      // Update hazards based on defs.
+      SeenRegs.clear();
+      for (const MachineOperand &Op : MI->all_defs())
+        processOperand(Op, false);
+    }
+
+    bool Changed = State != BlockState[&MBB].Out;
+    if (Emit) {
+      assert(!Changed && "Hazard state should not change on emit pass");
+      return Emitted;
+    }
+    if (Changed)
+      BlockState[&MBB].Out = State;
+    return Changed;
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    if (!ST.hasVALUReadSGPRHazard())
+      return false;
+
+    // Parse settings
+    EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits;
+    CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary;
+    CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait;
+    CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold;
+
+    if (!GlobalEnableSGPRHazardWaits.getNumOccurrences())
+      EnableSGPRHazardWaits = MF.getFunction().getFnAttributeAsParsedInteger(
+          "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
+    if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences())
+      CullSGPRHazardsOnFunctionBoundary =
+          MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull");
+    if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences())
+      CullSGPRHazardsAtMemWait =
+          MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull");
+    if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences())
+      CullSGPRHazardsMemWaitThreshold =
+          MF.getFunction().getFnAttributeAsParsedInteger(
+              "amdgpu-sgpr-hazard-mem-wait-cull-threshold",
+              CullSGPRHazardsMemWaitThreshold);
+
+    // Bail if disabled
+    if (!EnableSGPRHazardWaits)
+      return false;
+
+    LLVM_DEBUG(dbgs() << "AMDGPUWaitSGPRHazards running on " << MF.getName()
+                      << "\n");
+
+    TII = ST.getInstrInfo();
+    TRI = ST.getRegisterInfo();
+    MRI = &(MF.getRegInfo());
+    Wave64 = ST.isWave64();
+
+    auto CallingConv = MF.getFunction().getCallingConv();
+    if (!AMDGPU::isEntryFunctionCC(CallingConv) && !MF.empty() &&
+        !CullSGPRHazardsOnFunctionBoundary) {
+      // Callee must consider all SGPRs as tracked.
+      LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n");
+      MachineBasicBlock &EntryBlock = MF.front();
+      BlockState[&EntryBlock].In.Tracked.set();
+    }
+
+    // Calculate the hazard state for each basic block.
+    // Iterate until a fixed point is reached.
+    // Fixed point is guaranteed as merge function only ever increases
+    // the hazard set, and all backedges will cause a merge.
+    //
+    // Note: we have to take care of the entry block as this technically
+    // has an edge from outside the function. Failure to treat this as
+    // a merge could prevent fixed point being reached.
+    SetVector<MachineBasicBlock *> Worklist;
+    for (auto &MBB : reverse(MF))
+      Worklist.insert(&MBB);
+    while (!Worklist.empty()) {
+      auto &MBB = *Worklist.pop_back_val();
+      bool Changed = runOnMachineBasicBlock(MBB, false);
+      if (Changed) {
+        // Note: take a copy of state here in case it is reallocated by map
+        HazardState NewState = BlockState[&MBB].Out;
+        // Propagate to all successor blocks
+        for (auto Succ : MBB.successors()) {
+          // We only need to merge hazards at CFG merge points.
+          if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
+            if (BlockState[Succ].In != NewState) {
+              BlockState[Succ].In = NewState;
+              Worklist.insert(Succ);
+            }
+          } else if (BlockState[Succ].In.merge(NewState)) {
+            Worklist.insert(Succ);
+          }
+        }
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n");
+
+    // Final pass to emit wait instructions.
+    bool Changed = false;
+    for (auto &MBB : MF)
+      Changed |= runOnMachineBasicBlock(MBB, true);
+
+    BlockState.clear();
+    return Changed;
+  }
+};
+
+} // namespace
+
+char AMDGPUWaitSGPRHazards::ID = 0;
+
+char &llvm::AMDGPUWaitSGPRHazardsID = AMDGPUWaitSGPRHazards::ID;
+
+INITIALIZE_PASS(AMDGPUWaitSGPRHazards, DEBUG_TYPE,
+                "AMDGPU Insert waits for SGPR read hazards", false, false)
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 68d141e338a882..5da40e428ec177 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -106,6 +106,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp
   AMDGPUTargetTransformInfo.cpp
+  AMDGPUWaitSGPRHazards.cpp
   AMDGPUUnifyDivergentExitNodes.cpp
   AMDGPUUnifyMetadata.cpp
   R600MachineCFGStructurizer.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index ecf03b14143ee3..e4c130e0362e24 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -45,10 +45,6 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
                      cl::desc("Fill a percentage of the latency between "
                               "neighboring MFMA with s_nops."));
 
-static cl::opt<unsigned> MaxExhaustiveHazardSearch(
-    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
-    cl::desc("Maximum function size for exhausive hazard search"));
-
 //===----------------------------------------------------------------------===//
 // Hazard Recognizer Implementation
 //===----------------------------------------------------------------------===//
@@ -60,7 +56,6 @@ GCNHazardRecognizer::GCNHazardRecognizer(const Machine...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/118750