[llvm] [AMDGPU] Mitigate GFX12 VALU read SGPR hazard (PR #100067)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 23 05:10:52 PDT 2024
================
@@ -2876,22 +2909,269 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
auto NextMI = std::next(MI->getIterator());
// Add s_waitcnt_depctr sa_sdst(0) after SALU write.
- BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
- TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
// SALU write may be s_getpc in a bundle.
- if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
- // Update offsets of any references in the bundle.
- while (NextMI != MI->getParent()->end() &&
- NextMI->isBundledWithPred()) {
- for (auto &Operand : NextMI->operands()) {
- if (Operand.isGlobal())
- Operand.setOffset(Operand.getOffset() + 4);
+ updateGetPCBundle(NewMI);
+
+ return true;
+}
+
+static unsigned baseSGPRNumber(Register Reg, const SIRegisterInfo &TRI) {
+ unsigned RegN = TRI.getEncodingValue(Reg);
+ assert(RegN <= 127);
+ return (RegN >> 1) & 0x3f;
+}
+
+// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
+void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
+ assert(MMF == &MF);
+
+ // Assume non-empty vector means it has already been computed.
+ if (!VALUReadHazardSGPRs.empty())
+ return;
+
+ auto CallingConv = MF.getFunction().getCallingConv();
+ bool IsCallFree =
+ AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
+
+ // Exhaustive search is only viable in non-caller/callee functions where
+ // VALUs will be exposed to the hazard recognizer.
+ UseVALUReadHazardExhaustiveSearch =
+ IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
+ MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
+
+  // Consider all SGPRs hazards if the shader uses function calls or is a callee.
+ bool UseVALUUseCache =
+ IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
+ VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
+ if (!UseVALUUseCache)
+ return;
+
+ // Perform a post ordered reverse scan to find VALUs which read an SGPR
+ // before a SALU write to the same SGPR. This provides a reduction in
+ // hazard insertion when all VALU access to an SGPR occurs after its last
+ // SALU write, when compared to a linear scan.
+ const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
+ MachineCycleInfo CI;
+ CI.compute(*MMF);
+
+ for (auto *MBB : post_order(&MF)) {
+ bool InCycle = CI.getCycle(MBB) != nullptr;
+ for (auto &MI : reverse(MBB->instrs())) {
+ bool IsVALU = SIInstrInfo::isVALU(MI);
+ bool IsSALU = SIInstrInfo::isSALU(MI);
+ if (!(IsVALU || IsSALU))
+ continue;
+
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+ Register Reg = Op.getReg();
+ // Only consider implicit operands of VCC.
+ if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
+ Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
+ continue;
+ if (!TRI.isSGPRReg(MRI, Reg))
+ continue;
+ if (TRI.getEncodingValue(Reg) >= SGPR_NULL)
+ continue;
+ unsigned RegN = baseSGPRNumber(Reg, TRI);
+ if (IsVALU && Op.isUse()) {
+ // Note: any access within a cycle must be considered a hazard.
+ if (InCycle || (ReadSGPRs[RegN] && SALUWriteSGPRs[RegN]))
+ VALUReadHazardSGPRs.set(RegN);
+ ReadSGPRs.set(RegN);
+ } else if (IsSALU) {
+ if (Op.isDef())
+ SALUWriteSGPRs.set(RegN);
+ else
+ ReadSGPRs.set(RegN);
+ }
}
- NextMI++;
}
}
+}
+
+bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
+ if (!ST.hasVALUReadSGPRHazard())
+ return false;
+
+ // The hazard sequence is fundamentally three instructions:
+ // 1. VALU reads SGPR
+ // 2. SALU writes SGPR
+ // 3. VALU/SALU reads SGPR
+ // Try to avoid searching for (1) because the expiry point of the hazard is
+ // indeterminate; however, the hazard between (2) and (3) can expire if the
+ // gap contains sufficient SALU instructions with no usage of SGPR from (1).
+ // Note: SGPRs must be considered as 64-bit pairs as hazard exists
+ // even if individual SGPRs are accessed.
+
+ bool MIIsSALU = SIInstrInfo::isSALU(*MI);
+ bool MIIsVALU = SIInstrInfo::isVALU(*MI);
+ if (!(MIIsSALU || MIIsVALU))
+ return false;
+
+ // Avoid expensive search when compile time is priority by
+ // mitigating every SALU which writes an SGPR.
+ if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
+ if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
+ return false;
+
+ const MachineOperand *SDSTOp =
+ TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
+ if (!SDSTOp || !SDSTOp->isReg())
+ return false;
+
+ const Register HazardReg = SDSTOp->getReg();
+ if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
+ HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
+ return false;
+
+ // Add s_wait_alu sa_sdst(0) after SALU write.
+ auto NextMI = std::next(MI->getIterator());
+ auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+
+ // SALU write may be s_getpc in a bundle.
+ updateGetPCBundle(NewMI);
+
+ return true;
+ }
+
+ // Pre-compute set of SGPR pairs read by VALUs.
+ // Note: pass mutable pointer to MachineFunction for CycleInfo.
+ computeVALUHazardSGPRs(MI->getMF());
+
+  // If no VALU hazard SGPRs exist then there is nothing to do.
+ if (VALUReadHazardSGPRs.none())
+ return false;
+
+  // All SGPR writes before a call/return must be flushed as the callee/caller
+  // will not see the hazard chain, i.e. (2) to (3) described above.
+ const bool IsSetPC = (MI->getOpcode() == AMDGPU::S_SETPC_B64 ||
+ MI->getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ MI->getOpcode() == AMDGPU::S_SWAPPC_B64 ||
+ MI->getOpcode() == AMDGPU::S_CALL_B64);
+
+ // Collect all SGPR sources for MI which are read by a VALU.
+ const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallSet<Register, 4> SGPRsUsed;
+
+ if (!IsSetPC) {
+ for (const MachineOperand &Op : MI->all_uses()) {
+ Register OpReg = Op.getReg();
+
+ // Only consider VCC implicit uses on VALUs.
+ // The only expected SALU implicit access is SCC which is no hazard.
+ if (MIIsSALU && Op.isImplicit())
+ continue;
+
+ if (!TRI.isSGPRReg(MRI, OpReg))
+ continue;
+
+ // Ignore special purposes registers such as NULL, EXEC, and M0.
+ if (TRI.getEncodingValue(OpReg) >= SGPR_NULL)
+ continue;
+
+ unsigned RegN = baseSGPRNumber(OpReg, TRI);
+ if (!VALUReadHazardSGPRs[RegN])
+ continue;
+
+ SGPRsUsed.insert(OpReg);
+ }
+
+ // No SGPRs -> nothing to do.
+ if (SGPRsUsed.empty())
+ return false;
+ }
+
+ // A hazard is any SALU which writes one of the SGPRs read by MI.
+ auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
+ if (!SIInstrInfo::isSALU(I))
+ return false;
+ // Ensure SGPR flush before call/return by conservatively assuming every
+ // SALU writes an SGPR.
+ if (IsSetPC && I.getNumDefs() > 0)
+ return true;
+ // Check for any register writes.
+ return llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
----------------
arsenm wrote:
Don't need llvm::
https://github.com/llvm/llvm-project/pull/100067
More information about the llvm-commits
mailing list