[llvm] [AMDGPU] Mitigate GFX12 VALU read SGPR hazard (PR #100067)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 23 05:10:52 PDT 2024
================
@@ -2876,22 +2909,269 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
auto NextMI = std::next(MI->getIterator());
// Add s_waitcnt_depctr sa_sdst(0) after SALU write.
- BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
- TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
// SALU write may be s_getpc in a bundle.
- if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
- // Update offsets of any references in the bundle.
- while (NextMI != MI->getParent()->end() &&
- NextMI->isBundledWithPred()) {
- for (auto &Operand : NextMI->operands()) {
- if (Operand.isGlobal())
- Operand.setOffset(Operand.getOffset() + 4);
+ updateGetPCBundle(NewMI);
+
+ return true;
+}
+
+static unsigned baseSGPRNumber(Register Reg, const SIRegisterInfo &TRI) {
+ unsigned RegN = TRI.getEncodingValue(Reg);
+ assert(RegN <= 127);
+ return (RegN >> 1) & 0x3f;
+}
+
+// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
+void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
+ assert(MMF == &MF);
+
+ // Assume non-empty vector means it has already been computed.
+ if (!VALUReadHazardSGPRs.empty())
+ return;
+
+ auto CallingConv = MF.getFunction().getCallingConv();
+ bool IsCallFree =
+ AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
+
+ // Exhaustive search is only viable in non-caller/callee functions where
+ // VALUs will be exposed to the hazard recognizer.
+ UseVALUReadHazardExhaustiveSearch =
+ IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
+ MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
+
+  // Consider all SGPRs hazards if the shader uses function calls or is a callee.
+ bool UseVALUUseCache =
+ IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
+ VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
+ if (!UseVALUUseCache)
+ return;
+
+ // Perform a post ordered reverse scan to find VALUs which read an SGPR
+ // before a SALU write to the same SGPR. This provides a reduction in
+ // hazard insertion when all VALU access to an SGPR occurs after its last
+ // SALU write, when compared to a linear scan.
+ const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
+ MachineCycleInfo CI;
+ CI.compute(*MMF);
+
+ for (auto *MBB : post_order(&MF)) {
+ bool InCycle = CI.getCycle(MBB) != nullptr;
+ for (auto &MI : reverse(MBB->instrs())) {
+ bool IsVALU = SIInstrInfo::isVALU(MI);
+ bool IsSALU = SIInstrInfo::isSALU(MI);
+ if (!(IsVALU || IsSALU))
+ continue;
+
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+ Register Reg = Op.getReg();
+ // Only consider implicit operands of VCC.
+ if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
+ Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
+ continue;
+ if (!TRI.isSGPRReg(MRI, Reg))
+ continue;
+ if (TRI.getEncodingValue(Reg) >= SGPR_NULL)
+ continue;
+ unsigned RegN = baseSGPRNumber(Reg, TRI);
+ if (IsVALU && Op.isUse()) {
+ // Note: any access within a cycle must be considered a hazard.
+ if (InCycle || (ReadSGPRs[RegN] && SALUWriteSGPRs[RegN]))
+ VALUReadHazardSGPRs.set(RegN);
+ ReadSGPRs.set(RegN);
+ } else if (IsSALU) {
+ if (Op.isDef())
+ SALUWriteSGPRs.set(RegN);
+ else
+ ReadSGPRs.set(RegN);
+ }
}
- NextMI++;
}
}
+}
+
+bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
+ if (!ST.hasVALUReadSGPRHazard())
+ return false;
+
+ // The hazard sequence is fundamentally three instructions:
+ // 1. VALU reads SGPR
+ // 2. SALU writes SGPR
+ // 3. VALU/SALU reads SGPR
+ // Try to avoid searching for (1) because the expiry point of the hazard is
+ // indeterminate; however, the hazard between (2) and (3) can expire if the
+ // gap contains sufficient SALU instructions with no usage of SGPR from (1).
+ // Note: SGPRs must be considered as 64-bit pairs as hazard exists
+ // even if individual SGPRs are accessed.
+
+ bool MIIsSALU = SIInstrInfo::isSALU(*MI);
+ bool MIIsVALU = SIInstrInfo::isVALU(*MI);
+ if (!(MIIsSALU || MIIsVALU))
+ return false;
+
+ // Avoid expensive search when compile time is priority by
+ // mitigating every SALU which writes an SGPR.
+ if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
+ if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
+ return false;
+
+ const MachineOperand *SDSTOp =
+ TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
+ if (!SDSTOp || !SDSTOp->isReg())
+ return false;
+
+ const Register HazardReg = SDSTOp->getReg();
+ if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
+ HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
+ return false;
+
+ // Add s_wait_alu sa_sdst(0) after SALU write.
+ auto NextMI = std::next(MI->getIterator());
+ auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+
+ // SALU write may be s_getpc in a bundle.
+ updateGetPCBundle(NewMI);
+
+ return true;
+ }
+
+ // Pre-compute set of SGPR pairs read by VALUs.
+ // Note: pass mutable pointer to MachineFunction for CycleInfo.
+ computeVALUHazardSGPRs(MI->getMF());
+
+  // If no VALU hazard SGPRs exist then there is nothing to do.
+ if (VALUReadHazardSGPRs.none())
+ return false;
+
+  // All SGPR writes before a call/return must be flushed as the callee/caller
+  // will not see the hazard chain, i.e. (2) to (3) described above.
+ const bool IsSetPC = (MI->getOpcode() == AMDGPU::S_SETPC_B64 ||
+ MI->getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ MI->getOpcode() == AMDGPU::S_SWAPPC_B64 ||
+ MI->getOpcode() == AMDGPU::S_CALL_B64);
+
+ // Collect all SGPR sources for MI which are read by a VALU.
+ const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallSet<Register, 4> SGPRsUsed;
+
+ if (!IsSetPC) {
+ for (const MachineOperand &Op : MI->all_uses()) {
+ Register OpReg = Op.getReg();
+
+ // Only consider VCC implicit uses on VALUs.
+ // The only expected SALU implicit access is SCC which is no hazard.
+ if (MIIsSALU && Op.isImplicit())
+ continue;
+
+ if (!TRI.isSGPRReg(MRI, OpReg))
+ continue;
+
+ // Ignore special purposes registers such as NULL, EXEC, and M0.
+ if (TRI.getEncodingValue(OpReg) >= SGPR_NULL)
+ continue;
+
+ unsigned RegN = baseSGPRNumber(OpReg, TRI);
+ if (!VALUReadHazardSGPRs[RegN])
+ continue;
+
+ SGPRsUsed.insert(OpReg);
+ }
+
+ // No SGPRs -> nothing to do.
+ if (SGPRsUsed.empty())
+ return false;
+ }
+
+ // A hazard is any SALU which writes one of the SGPRs read by MI.
+ auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
+ if (!SIInstrInfo::isSALU(I))
+ return false;
+ // Ensure SGPR flush before call/return by conservatively assuming every
+ // SALU writes an SGPR.
+ if (IsSetPC && I.getNumDefs() > 0)
+ return true;
+ // Check for any register writes.
+ return llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
----------------
arsenm wrote:
Don't need llvm::
https://github.com/llvm/llvm-project/pull/100067
More information about the llvm-commits
mailing list