[llvm] [AMDGPU] Add option to bias SGPR allocation to reduce read hazards (PR #129869)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 15 21:55:46 PDT 2025
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/129869
>From 7ad9eee33f443f8c827cd07ba343fe82d698f385 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 5 Dec 2024 16:10:35 +0900
Subject: [PATCH 1/3] [AMDGPU] Add option to bias SGPR allocation to reduce
read hazards
- Scan for potential hazards in virtual registers before SGPR allocation.
- Use this data to build a new allocation order via allocation hints, as sketched below.
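For orientation, the hook this series drives is TargetRegisterInfo::getRegAllocationHints; returning true from it makes the hints the complete ("hard") allocation order. A minimal sketch of biasing the order that way (simplified; the Flagged() predicate is hypothetical, and a real implementation deduplicates against the copy hints, as the patch below does):

  bool MyTRI::getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
                                    SmallVectorImpl<MCPhysReg> &Hints,
                                    const MachineFunction &MF,
                                    const VirtRegMap *VRM,
                                    const LiveRegMatrix *Matrix) const {
    // Collect target-independent (copy) hints first.
    TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM,
                                              Matrix);
    // Replay the allocation order, reversed for hazard-flagged vregs.
    if (Flagged(VirtReg)) // hypothetical hazard predicate
      Hints.append(Order.rbegin(), Order.rend());
    else
      Hints.append(Order.begin(), Order.end());
    return true; // hints now form the full allocation order
  }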
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 3 +
.../AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp | 102 ++++++++
.../Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h | 25 ++
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/SIDefines.h | 1 +
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 162 +++++++++++-
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 2 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 +
.../CodeGen/AMDGPU/sgpr-hazard-realloc.ll | 242 ++++++++++++++++++
10 files changed, 542 insertions(+), 3 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5a917734e9c74..6f614c6346af5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -549,6 +549,9 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
+void initializeAMDGPUMarkSGPRHazardRegsLegacyPass(PassRegistry &);
+extern char &AMDGPUMarkSGPRHazardRegsLegacyID;
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
new file mode 100644
index 0000000000000..46dfcbb48e54f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
@@ -0,0 +1,102 @@
+//===- AMDGPUMarkSGPRHazardRegs.cpp - Annotate SGPRs used by VALU ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to mark SGPRs used by VALU.
+/// Marks can be used during register allocation to reduce hazards.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMarkSGPRHazardRegs.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mark-sgpr-hazard-regs"
+
+namespace {
+
+class AMDGPUMarkSGPRHazardRegs {
+public:
+ AMDGPUMarkSGPRHazardRegs() {}
+ bool run(MachineFunction &MF);
+};
+
+class AMDGPUMarkSGPRHazardRegsLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUMarkSGPRHazardRegsLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ return AMDGPUMarkSGPRHazardRegs().run(MF);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+bool AMDGPUMarkSGPRHazardRegs::run(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasVALUReadSGPRHazard())
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ if (!TRI->getSGPRHazardAvoidanceStrategy(MF))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "AMDGPUMarkSGPRHazardRegs: function " << MF.getName()
+ << "\n");
+
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+ for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ const auto *RC = MRI->getRegClass(Reg);
+ if (!RC || !TRI->isSGPRClass(RC))
+ continue;
+ for (const auto &MO : MRI->reg_nodbg_operands(Reg)) {
+ const MachineInstr &MI = *MO.getParent();
+ if (SIInstrInfo::isVALU(MI) && MO.isUse()) {
+ FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+ break;
+ }
+ }
+ }
+
+ return true;
+}
+
+INITIALIZE_PASS(AMDGPUMarkSGPRHazardRegsLegacy, DEBUG_TYPE,
+ "AMDGPU Mark Hazard SGPRs", false, false)
+
+char AMDGPUMarkSGPRHazardRegsLegacy::ID = 0;
+
+char &llvm::AMDGPUMarkSGPRHazardRegsLegacyID =
+ AMDGPUMarkSGPRHazardRegsLegacy::ID;
+
+PreservedAnalyses
+AMDGPUMarkSGPRHazardRegsPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ AMDGPUMarkSGPRHazardRegs().run(MF);
+ return PreservedAnalyses::all();
+}
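The mark set above rides on the same per-virtual-register flag plumbing already used for WWM_REG, so the allocator-side consumer reads it back via SIMachineFunctionInfo::checkFlag. A minimal sketch of that query (the helper name is illustrative, not part of the patch):

  static bool isMarkedHazardReg(const SIMachineFunctionInfo &FuncInfo,
                                Register VReg) {
    return FuncInfo.checkFlag(VReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
  }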
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
new file mode 100644
index 0000000000000..89905ceb1185d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
@@ -0,0 +1,25 @@
+//===--- AMDGPUMarkSGPRHazardRegs.h -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDREGS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDREGS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPUMarkSGPRHazardRegsPass
+ : public PassInfoMixin<AMDGPUMarkSGPRHazardRegsPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDREGS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d59087839b0e1..11f5308e70c68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -23,6 +23,7 @@
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
+#include "AMDGPUMarkSGPRHazardRegs.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
@@ -567,6 +568,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
+ initializeAMDGPUMarkSGPRHazardRegsLegacyPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1667,6 +1669,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
addPass(&GCNPreRALongBranchRegID);
+ addPass(&AMDGPUMarkSGPRHazardRegsLegacyID);
addPass(createSGPRAllocPass(true));
// Commit allocated register changes. This is mostly necessary because too
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c6d70ee39202e..3c09023088a4d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -84,6 +84,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUIGroupLP.cpp
AMDGPUMCResourceInfo.cpp
AMDGPUMarkLastScratchLoad.cpp
+ AMDGPUMarkSGPRHazardRegs.cpp
AMDGPUMIRFormatter.cpp
AMDGPUPerfHintAnalysis.cpp
AMDGPUPostLegalizerCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 0f603a43fd626..f46a73801e3c1 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1045,6 +1045,7 @@ namespace VirtRegFlag {
enum Register_Flag : uint8_t {
// Register operand in a whole-wave mode operation.
WWM_REG = 1 << 0,
+ SGPR_HAZARD_REG = 1 << 1
};
} // namespace VirtRegFlag
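Register_Flag values are independent bits, so one virtual register can in principle carry several flags at once. Purely to illustrate the encoding (nothing in this patch combines them):

  uint8_t Flags = AMDGPU::VirtRegFlag::WWM_REG |
                  AMDGPU::VirtRegFlag::SGPR_HAZARD_REG; // == 3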
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index e41189adfb46f..0f088c044f177 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -11,14 +11,15 @@
//
//===----------------------------------------------------------------------===//
+#include "SIRegisterInfo.h"
#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -35,6 +36,10 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
+static cl::opt<unsigned> SGPRHazardAvoidanceStrategy(
+ "amdgpu-sgpr-hazard-regalloc", cl::init(0), cl::ReallyHidden,
+ cl::desc("Register allocation strategy to reduce SGPR read hazards"));
+
std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
@@ -3840,9 +3845,152 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
return false;
}
default:
- return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
- VRM);
+ break;
+ }
+
+ bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+ VirtReg, Order, Hints, MF, VRM, Matrix);
+ if (!VRM)
+ return BaseImplRetVal;
+
+ // Only use hinting to reduce SGPR read hazards when required.
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasVALUReadSGPRHazard())
+ return BaseImplRetVal;
+
+ // Only SGPR classes are of interest here
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ const auto *RC = MRI.getRegClass(VirtReg);
+ if (!isSGPRClass(RC))
+ return BaseImplRetVal;
+
+ const unsigned Strategy = getSGPRHazardAvoidanceStrategy(MF);
+ if (!Strategy)
+ return BaseImplRetVal;
+
+ SmallSet<MCPhysReg, 4> CopyHints;
+ CopyHints.insert(Hints.begin(), Hints.end());
+
+ auto AddHint = [&](MCPhysReg PhysReg) {
+ if (CopyHints.contains(PhysReg) || MRI.isReserved(PhysReg))
+ return;
+ Hints.push_back(PhysReg);
+ };
+ auto AddHints = [&](ArrayRef<MCPhysReg> Regs) {
+ for (MCPhysReg PhysReg : Regs)
+ AddHint(PhysReg);
+ };
+
+ // V1: simply reverse allocation order, mean 23% reduction in hazards
+ if (Strategy == 1) {
+ if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ for (MCPhysReg PhysReg : reverse(Order))
+ AddHint(PhysReg);
+ } else {
+ for (MCPhysReg PhysReg : Order)
+ AddHint(PhysReg);
+ }
+ return true;
+ }
+
+ // Build set of current hazard pairs from live matrix
+ auto *LiveUnions = const_cast<LiveRegMatrix *>(Matrix)->getLiveUnions();
+
+ DenseMap<MCPhysReg, unsigned> IntervalCount;
+ std::bitset<64> HazardPairs;
+
+ for (MCPhysReg PhysReg : Order) {
+ SmallSet<const LiveInterval *, 4> Intervals;
+ bool IsHazard = false;
+ for (auto Unit : TRI->regunits(PhysReg)) {
+ LiveIntervalUnion &LIU = LiveUnions[Unit];
+ for (const LiveInterval *LI : LIU.getMap()) {
+ Intervals.insert(LI);
+ if (FuncInfo->checkFlag(LI->reg(),
+ AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ IsHazard = true;
+ // Break here as we only care about interval count for non-hazard regs
+ break;
+ }
+ }
+ if (IsHazard)
+ break;
+ }
+ if (IsHazard) {
+ unsigned PairN = TRI->getEncodingValue(PhysReg) >> 1;
+ if (PairN <= 63)
+ HazardPairs.set(PairN);
+ }
+ IntervalCount[PhysReg] = Intervals.size();
+ }
+
+ // V2: weight the entire order based on hazard-free usage, mean 30% reduction
+ // in hazards
+ if (Strategy == 2) {
+ bool VRegIsHazard =
+ FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+ SmallVector<MCPhysReg> NewOrder(Order);
+ std::sort(NewOrder.begin(), NewOrder.end(), [&](MCPhysReg A, MCPhysReg B) {
+ return VRegIsHazard ? IntervalCount[A] < IntervalCount[B]
+ : IntervalCount[B] < IntervalCount[A];
+ });
+ AddHints(NewOrder);
+ return true;
+ }
+
+ // V3: complex partitioning, mean 35% reduction in hazards
+ assert(Strategy == 3);
+
+ // Partition the allocation order based on hazards
+ SmallVector<MCPhysReg> Unallocated, UnallocatedWithHazard;
+ SmallVector<MCPhysReg> Allocated, AllocatedWithHazard;
+
+ for (MCPhysReg PhysReg : Order) {
+ Register VReg = Matrix->getOneVReg(PhysReg);
+ bool HasHazard = false;
+ // XXX: can remove regunit scan for just SGPR32/SGPR64
+ for (auto Unit : TRI->regunits(PhysReg)) {
+ unsigned PairN = TRI->getEncodingValue(Unit) >> 1;
+ if (PairN <= 63 && HazardPairs[PairN]) {
+ HasHazard = true;
+ break;
+ }
+ }
+ if (VReg == MCRegister::NoRegister) {
+ if (HasHazard)
+ UnallocatedWithHazard.push_back(PhysReg);
+ else
+ Unallocated.push_back(PhysReg);
+ } else {
+ if (HasHazard)
+ AllocatedWithHazard.push_back(PhysReg);
+ else
+ Allocated.push_back(PhysReg);
+ }
}
+
+ if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ // Reorder allocations based on usage, so least used will be reused first.
+ // This means least used regs are touched by hazards first.
+ std::sort(Allocated.begin(), Allocated.end(),
+ [&](MCPhysReg A, MCPhysReg B) {
+ return IntervalCount[A] < IntervalCount[B];
+ });
+ // Reverse the order of allocations to try to keep hazards away; empirically this helps.
+ std::reverse(Unallocated.begin(), Unallocated.end());
+
+ AddHints(AllocatedWithHazard);
+ AddHints(UnallocatedWithHazard);
+ AddHints(Unallocated);
+ AddHints(Allocated);
+ } else {
+ AddHints(Allocated);
+ AddHints(Unallocated);
+ AddHints(UnallocatedWithHazard);
+ AddHints(AllocatedWithHazard);
+ }
+
+ return true;
}
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
@@ -4064,3 +4212,11 @@ SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
RegFlags.push_back("WWM_REG");
return RegFlags;
}
+
+unsigned SIRegisterInfo::getSGPRHazardAvoidanceStrategy(
+ const MachineFunction &MF) const {
+ if (SGPRHazardAvoidanceStrategy.getNumOccurrences())
+ return SGPRHazardAvoidanceStrategy;
+ return MF.getFunction().getFnAttributeAsParsedInteger(
+ "amdgpu-sgpr-hazard-regalloc", 0);
+}
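getSGPRHazardAvoidanceStrategy gives an explicit -amdgpu-sgpr-hazard-regalloc occurrence priority over the function attribute, so a frontend can opt individual functions into a strategy. A hedged sketch of the attribute side (the helper is ours; the attribute string is the one parsed above):

  // Scoped equivalent of -amdgpu-sgpr-hazard-regalloc=3 for one function.
  void selectHazardStrategy(llvm::Function &F, unsigned Strategy) {
    F.addFnAttr("amdgpu-sgpr-hazard-regalloc", std::to_string(Strategy));
  }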
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a4b135d5e0b59..f67db5b5e59df 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -493,6 +493,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
SmallVector<StringLiteral>
getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
+
+ unsigned getSGPRHazardAvoidanceStrategy(const MachineFunction &MF) const;
};
namespace AMDGPU {
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index dd2ff2e013cc8..8eae74e0dc08a 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -350,6 +350,7 @@
; GCN-O1-NEXT: SI Whole Quad Mode
; GCN-O1-NEXT: SI optimize exec mask operations pre-RA
; GCN-O1-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O1-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Machine Block Frequency Analysis
; GCN-O1-NEXT: Debug Variable Analysis
@@ -660,6 +661,7 @@
; GCN-O1-OPTS-NEXT: SI Whole Quad Mode
; GCN-O1-OPTS-NEXT: SI optimize exec mask operations pre-RA
; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O1-OPTS-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Debug Variable Analysis
@@ -976,6 +978,7 @@
; GCN-O2-NEXT: SI optimize exec mask operations pre-RA
; GCN-O2-NEXT: SI Form memory clauses
; GCN-O2-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O2-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Machine Block Frequency Analysis
; GCN-O2-NEXT: Debug Variable Analysis
@@ -1305,6 +1308,7 @@
; GCN-O3-NEXT: SI optimize exec mask operations pre-RA
; GCN-O3-NEXT: SI Form memory clauses
; GCN-O3-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O3-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Machine Block Frequency Analysis
; GCN-O3-NEXT: Debug Variable Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
new file mode 100644
index 0000000000000..36105f64f11b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=0 < %s | FileCheck -check-prefix DEF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=1 < %s | FileCheck -check-prefix V1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=2 < %s | FileCheck -check-prefix V2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=3 < %s | FileCheck -check-prefix V3 %s
+
+define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float %d, ptr addrspace(1) %out, <4 x i32> inreg %desc) {
+; DEF-LABEL: fadd_f32:
+; DEF: ; %bb.0: ; %entry
+; DEF-NEXT: s_mov_b32 s6, s4
+; DEF-NEXT: s_mov_b32 s4, s2
+; DEF-NEXT: s_add_f32 s2, s0, s1
+; DEF-NEXT: s_sub_f32 s1, s0, s1
+; DEF-NEXT: s_mov_b32 s7, s5
+; DEF-NEXT: s_mov_b32 s5, s3
+; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DEF-NEXT: v_dual_add_f32 v0, s2, v0 :: v_dual_add_f32 v1, s1, v1
+; DEF-NEXT: v_readfirstlane_b32 s0, v0
+; DEF-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DEF-NEXT: v_readfirstlane_b32 s3, v1
+; DEF-NEXT: v_mul_f32_e32 v4, v0, v1
+; DEF-NEXT: s_and_b32 s0, s0, s3
+; DEF-NEXT: global_store_b32 v[2:3], v4, off
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_cmp_lg_u32 s0, 0
+; DEF-NEXT: s_mov_b32 s0, 0
+; DEF-NEXT: s_cbranch_scc0 .LBB0_5
+; DEF-NEXT: ; %bb.1: ; %false
+; DEF-NEXT: s_buffer_load_b32 s3, s[4:7], 0x0
+; DEF-NEXT: s_and_b32 s1, s2, s1
+; DEF-NEXT: v_add_f32_e32 v0, v0, v1
+; DEF-NEXT: s_mov_b32 s8, exec_lo
+; DEF-NEXT: s_wait_kmcnt 0x0
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_lshl_b32 s1, s3, s1
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: v_cmp_ne_u32_e32 vcc_lo, s1, v1
+; DEF-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_and_not1_b32 s8, s8, s1
+; DEF-NEXT: s_cbranch_scc0 .LBB0_6
+; DEF-NEXT: ; %bb.2: ; %false
+; DEF-NEXT: s_and_b32 exec_lo, exec_lo, s8
+; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DEF-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; DEF-NEXT: s_cbranch_vccnz .LBB0_4
+; DEF-NEXT: .LBB0_3: ; %true
+; DEF-NEXT: v_mul_f32_e32 v0, v1, v4
+; DEF-NEXT: .LBB0_4: ; %final
+; DEF-NEXT: s_branch .LBB0_7
+; DEF-NEXT: .LBB0_5:
+; DEF-NEXT: ; implicit-def: $vgpr0
+; DEF-NEXT: s_branch .LBB0_3
+; DEF-NEXT: .LBB0_6:
+; DEF-NEXT: s_mov_b32 exec_lo, 0
+; DEF-NEXT: export mrt0 off, off, off, off done
+; DEF-NEXT: s_endpgm
+; DEF-NEXT: .LBB0_7:
+;
+; V1-LABEL: fadd_f32:
+; V1: ; %bb.0: ; %entry
+; V1-NEXT: s_add_f32 s104, s0, s1
+; V1-NEXT: s_sub_f32 s103, s0, s1
+; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; V1-NEXT: v_dual_add_f32 v0, s104, v0 :: v_dual_add_f32 v1, s103, v1
+; V1-NEXT: v_readfirstlane_b32 s0, v0
+; V1-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; V1-NEXT: v_readfirstlane_b32 s1, v1
+; V1-NEXT: v_mul_f32_e32 v4, v0, v1
+; V1-NEXT: s_and_b32 s0, s0, s1
+; V1-NEXT: global_store_b32 v[2:3], v4, off
+; V1-NEXT: s_cmp_lg_u32 s0, 0
+; V1-NEXT: s_mov_b32 s0, 0
+; V1-NEXT: s_cbranch_scc0 .LBB0_5
+; V1-NEXT: ; %bb.1: ; %false
+; V1-NEXT: s_mov_b32 s7, s5
+; V1-NEXT: s_mov_b32 s6, s4
+; V1-NEXT: s_mov_b32 s5, s3
+; V1-NEXT: s_mov_b32 s4, s2
+; V1-NEXT: s_and_b32 s2, s104, s103
+; V1-NEXT: s_buffer_load_b32 s1, s[4:7], 0x0
+; V1-NEXT: v_add_f32_e32 v0, v0, v1
+; V1-NEXT: s_mov_b32 s8, exec_lo
+; V1-NEXT: s_wait_kmcnt 0x0
+; V1-NEXT: s_lshl_b32 vcc_hi, s1, s2
+; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; V1-NEXT: v_cmp_ne_u32_e32 vcc_lo, vcc_hi, v1
+; V1-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo
+; V1-NEXT: s_and_not1_b32 s8, s8, s1
+; V1-NEXT: s_cbranch_scc0 .LBB0_6
+; V1-NEXT: ; %bb.2: ; %false
+; V1-NEXT: s_and_b32 exec_lo, exec_lo, s8
+; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; V1-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; V1-NEXT: s_wait_alu 0xfffe
+; V1-NEXT: s_cbranch_vccnz .LBB0_4
+; V1-NEXT: .LBB0_3: ; %true
+; V1-NEXT: v_mul_f32_e32 v0, v1, v4
+; V1-NEXT: .LBB0_4: ; %final
+; V1-NEXT: s_branch .LBB0_7
+; V1-NEXT: .LBB0_5:
+; V1-NEXT: ; implicit-def: $vgpr0
+; V1-NEXT: s_branch .LBB0_3
+; V1-NEXT: .LBB0_6:
+; V1-NEXT: s_mov_b32 exec_lo, 0
+; V1-NEXT: export mrt0 off, off, off, off done
+; V1-NEXT: s_endpgm
+; V1-NEXT: .LBB0_7:
+;
+; V2-LABEL: fadd_f32:
+; V2: ; %bb.0: ; %entry
+; V2-NEXT: s_add_f32 s62, s0, s1
+; V2-NEXT: s_sub_f32 s61, s0, s1
+; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; V2-NEXT: v_dual_add_f32 v0, s62, v0 :: v_dual_add_f32 v1, s61, v1
+; V2-NEXT: v_readfirstlane_b32 s1, v0
+; V2-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; V2-NEXT: v_readfirstlane_b32 vcc_lo, v1
+; V2-NEXT: v_mul_f32_e32 v4, v0, v1
+; V2-NEXT: s_and_b32 s1, s1, vcc_lo
+; V2-NEXT: global_store_b32 v[2:3], v4, off
+; V2-NEXT: s_cmp_lg_u32 s1, 0
+; V2-NEXT: s_mov_b32 s1, 0
+; V2-NEXT: s_cbranch_scc0 .LBB0_5
+; V2-NEXT: ; %bb.1: ; %false
+; V2-NEXT: s_mov_b32 s55, s5
+; V2-NEXT: s_mov_b32 s54, s4
+; V2-NEXT: s_mov_b32 s53, s3
+; V2-NEXT: s_mov_b32 s52, s2
+; V2-NEXT: v_add_f32_e32 v0, v0, v1
+; V2-NEXT: s_buffer_load_b32 vcc_lo, s[52:55], 0x0
+; V2-NEXT: s_and_b32 s54, s62, s61
+; V2-NEXT: s_mov_b32 s69, exec_lo
+; V2-NEXT: s_wait_kmcnt 0x0
+; V2-NEXT: s_lshl_b32 s67, vcc_lo, s54
+; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; V2-NEXT: v_cmp_ne_u32_e32 vcc_lo, s67, v1
+; V2-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo
+; V2-NEXT: s_and_not1_b32 s69, s69, vcc_lo
+; V2-NEXT: s_cbranch_scc0 .LBB0_6
+; V2-NEXT: ; %bb.2: ; %false
+; V2-NEXT: s_and_b32 exec_lo, exec_lo, s69
+; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; V2-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; V2-NEXT: s_cbranch_vccnz .LBB0_4
+; V2-NEXT: .LBB0_3: ; %true
+; V2-NEXT: v_mul_f32_e32 v0, v1, v4
+; V2-NEXT: .LBB0_4: ; %final
+; V2-NEXT: s_branch .LBB0_7
+; V2-NEXT: .LBB0_5:
+; V2-NEXT: ; implicit-def: $vgpr0
+; V2-NEXT: s_branch .LBB0_3
+; V2-NEXT: .LBB0_6:
+; V2-NEXT: s_mov_b32 exec_lo, 0
+; V2-NEXT: export mrt0 off, off, off, off done
+; V2-NEXT: s_endpgm
+; V2-NEXT: .LBB0_7:
+;
+; V3-LABEL: fadd_f32:
+; V3: ; %bb.0: ; %entry
+; V3-NEXT: s_add_f32 s104, s0, s1
+; V3-NEXT: s_sub_f32 s82, s0, s1
+; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; V3-NEXT: v_dual_add_f32 v0, s104, v0 :: v_dual_add_f32 v1, s82, v1
+; V3-NEXT: v_readfirstlane_b32 s0, v0
+; V3-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; V3-NEXT: v_readfirstlane_b32 s1, v1
+; V3-NEXT: v_mul_f32_e32 v4, v0, v1
+; V3-NEXT: s_and_b32 s0, s0, s1
+; V3-NEXT: global_store_b32 v[2:3], v4, off
+; V3-NEXT: s_cmp_lg_u32 s0, 0
+; V3-NEXT: s_mov_b32 s0, 0
+; V3-NEXT: s_cbranch_scc0 .LBB0_5
+; V3-NEXT: ; %bb.1: ; %false
+; V3-NEXT: s_mov_b32 s7, s5
+; V3-NEXT: s_mov_b32 s6, s4
+; V3-NEXT: s_mov_b32 s5, s3
+; V3-NEXT: s_mov_b32 s4, s2
+; V3-NEXT: v_add_f32_e32 v0, v0, v1
+; V3-NEXT: s_buffer_load_b32 s1, s[4:7], 0x0
+; V3-NEXT: s_and_b32 s4, s104, s82
+; V3-NEXT: s_mov_b32 s8, exec_lo
+; V3-NEXT: s_wait_kmcnt 0x0
+; V3-NEXT: s_lshl_b32 s82, s1, s4
+; V3-NEXT: s_wait_alu 0xfffe
+; V3-NEXT: v_cmp_ne_u32_e32 vcc_lo, s82, v1
+; V3-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo
+; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; V3-NEXT: s_and_not1_b32 s8, s8, s1
+; V3-NEXT: s_cbranch_scc0 .LBB0_6
+; V3-NEXT: ; %bb.2: ; %false
+; V3-NEXT: s_and_b32 exec_lo, exec_lo, s8
+; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; V3-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; V3-NEXT: s_cbranch_vccnz .LBB0_4
+; V3-NEXT: .LBB0_3: ; %true
+; V3-NEXT: v_mul_f32_e32 v0, v1, v4
+; V3-NEXT: .LBB0_4: ; %final
+; V3-NEXT: s_branch .LBB0_7
+; V3-NEXT: .LBB0_5:
+; V3-NEXT: ; implicit-def: $vgpr0
+; V3-NEXT: s_branch .LBB0_3
+; V3-NEXT: .LBB0_6:
+; V3-NEXT: s_mov_b32 exec_lo, 0
+; V3-NEXT: export mrt0 off, off, off, off done
+; V3-NEXT: s_endpgm
+; V3-NEXT: .LBB0_7:
+entry:
+ %s.0 = fadd float %a, %b
+ %s.1 = fsub float %a, %b
+ %v.0 = fadd float %c, %s.0
+ %v.1 = fadd float %d, %s.1
+ %v.2 = fmul float %v.0, %v.1
+ store float %v.2, ptr addrspace(1) %out
+ %tmp.0 = bitcast float %v.0 to i32
+ %tmp.1 = bitcast float %v.1 to i32
+ %tmp.2 = bitcast float %s.0 to i32
+ %tmp.3 = bitcast float %s.1 to i32
+ %s.3 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %tmp.0)
+ %s.4 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %tmp.1)
+ %s.5 = and i32 %s.3, %s.4
+ %s.6 = and i32 %tmp.2, %tmp.3
+ %c.0 = icmp eq i32 %s.5, 0
+ br i1 %c.0, label %true, label %false
+true:
+ %v.3 = fmul float %v.1, %v.2
+ br label %final
+false:
+ %v.4 = fadd float %v.0, %v.1
+ %s.7 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
+ %s.8 = shl i32 %s.7, %s.6
+ %c.1 = icmp ne i32 %tmp.1, %s.8
+ call void @llvm.amdgcn.wqm.demote(i1 %c.1)
+ br label %final
+final:
+ %res = phi float [ %v.4, %false ], [ %v.3, %true ]
+ ret float %res
+}
+
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
+declare void @llvm.amdgcn.wqm.demote(i1)
>From c6ccbb75f9e1709cb20360b3044e8482946ee1e7 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 16 Jun 2025 12:21:29 +0900
Subject: [PATCH 2/3] - Fold analysis into getRegAllocationHints
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 3 -
.../AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp | 102 ------------------
.../Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h | 25 -----
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 -
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 -
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 26 +++--
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 -
7 files changed, 20 insertions(+), 144 deletions(-)
delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 6f614c6346af5..5a917734e9c74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -549,9 +549,6 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
-void initializeAMDGPUMarkSGPRHazardRegsLegacyPass(PassRegistry &);
-extern char &AMDGPUMarkSGPRHazardRegsLegacyID;
-
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
deleted file mode 100644
index 46dfcbb48e54f..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-//===- AMDGPUMarkSGPRHazardRegs.cpp - Annotate SGPRs used by VALU ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file Pass to mark SGPRs used by VALU.
-/// Marks can be used during register allocation to reduce hazards.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUMarkSGPRHazardRegs.h"
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/RegisterClassInfo.h"
-#include "llvm/CodeGen/VirtRegMap.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "amdgpu-mark-sgpr-hazard-regs"
-
-namespace {
-
-class AMDGPUMarkSGPRHazardRegs {
-public:
- AMDGPUMarkSGPRHazardRegs() {}
- bool run(MachineFunction &MF);
-};
-
-class AMDGPUMarkSGPRHazardRegsLegacy : public MachineFunctionPass {
-public:
- static char ID;
-
- AMDGPUMarkSGPRHazardRegsLegacy() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override {
- if (skipFunction(MF.getFunction()))
- return false;
- return AMDGPUMarkSGPRHazardRegs().run(MF);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // End anonymous namespace.
-
-bool AMDGPUMarkSGPRHazardRegs::run(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasVALUReadSGPRHazard())
- return false;
-
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- if (!TRI->getSGPRHazardAvoidanceStrategy(MF))
- return false;
-
- LLVM_DEBUG(dbgs() << "AMDGPUMarkSGPRHazardRegs: function " << MF.getName()
- << "\n");
-
- const MachineRegisterInfo *MRI = &MF.getRegInfo();
- SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-
- for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
- Register Reg = Register::index2VirtReg(I);
- if (MRI->reg_nodbg_empty(Reg))
- continue;
- const auto *RC = MRI->getRegClass(Reg);
- if (!RC || !TRI->isSGPRClass(RC))
- continue;
- for (const auto &MO : MRI->reg_nodbg_operands(Reg)) {
- const MachineInstr &MI = *MO.getParent();
- if (SIInstrInfo::isVALU(MI) && MO.isUse()) {
- FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
- break;
- }
- }
- }
-
- return true;
-}
-
-INITIALIZE_PASS(AMDGPUMarkSGPRHazardRegsLegacy, DEBUG_TYPE,
- "AMDGPU Mark Hazard SGPRs", false, false)
-
-char AMDGPUMarkSGPRHazardRegsLegacy::ID = 0;
-
-char &llvm::AMDGPUMarkSGPRHazardRegsLegacyID =
- AMDGPUMarkSGPRHazardRegsLegacy::ID;
-
-PreservedAnalyses
-AMDGPUMarkSGPRHazardRegsPass::run(MachineFunction &MF,
- MachineFunctionAnalysisManager &MFAM) {
- AMDGPUMarkSGPRHazardRegs().run(MF);
- return PreservedAnalyses::all();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
deleted file mode 100644
index 89905ceb1185d..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
+++ /dev/null
@@ -1,25 +0,0 @@
-//===--- AMDGPUMarkSGPRHazardRegs.h -----------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDREGS_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDREGS_H
-
-#include "llvm/CodeGen/MachinePassManager.h"
-
-namespace llvm {
-
-class AMDGPUMarkSGPRHazardRegsPass
- : public PassInfoMixin<AMDGPUMarkSGPRHazardRegsPass> {
-public:
- PreservedAnalyses run(MachineFunction &MF,
- MachineFunctionAnalysisManager &MFAM);
-};
-
-} // namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDREGS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 11f5308e70c68..d59087839b0e1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -23,7 +23,6 @@
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
-#include "AMDGPUMarkSGPRHazardRegs.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
@@ -568,7 +567,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
- initializeAMDGPUMarkSGPRHazardRegsLegacyPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1669,7 +1667,6 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
addPass(&GCNPreRALongBranchRegID);
- addPass(&AMDGPUMarkSGPRHazardRegsLegacyID);
addPass(createSGPRAllocPass(true));
// Commit allocated register changes. This is mostly necessary because too
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 3c09023088a4d..c6d70ee39202e 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -84,7 +84,6 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUIGroupLP.cpp
AMDGPUMCResourceInfo.cpp
AMDGPUMarkLastScratchLoad.cpp
- AMDGPUMarkSGPRHazardRegs.cpp
AMDGPUMIRFormatter.cpp
AMDGPUPerfHintAnalysis.cpp
AMDGPUPostLegalizerCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 0f088c044f177..4aca1bbc7d135 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3864,10 +3864,26 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
if (!isSGPRClass(RC))
return BaseImplRetVal;
+ // Exit early when no avoidance strategy is enabled
const unsigned Strategy = getSGPRHazardAvoidanceStrategy(MF);
if (!Strategy)
return BaseImplRetVal;
+ // A register has a hazard if it is an SGPR read by a VALU instruction
+ DenseMap<Register, bool> HazardRegs;
+ auto HasSGPRHazard = [&MRI, TRI, &HazardRegs](Register Reg) {
+ const auto *RC = MRI.getRegClass(Reg);
+ if (!RC || !TRI->isSGPRClass(RC))
+ return false;
+ if (!HazardRegs.contains(Reg)) {
+ HazardRegs[Reg] = llvm::any_of(
+ MRI.reg_nodbg_operands(Reg), [](const MachineOperand &MO) {
+ return MO.isUse() && SIInstrInfo::isVALU(*MO.getParent());
+ });
+ }
+ return HazardRegs[Reg];
+ };
+
SmallSet<MCPhysReg, 4> CopyHints;
CopyHints.insert(Hints.begin(), Hints.end());
@@ -3883,7 +3899,7 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
// V1: simply reverse allocation order, mean 23% reduction in hazards
if (Strategy == 1) {
- if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ if (HasSGPRHazard(VirtReg)) {
for (MCPhysReg PhysReg : reverse(Order))
AddHint(PhysReg);
} else {
@@ -3906,8 +3922,7 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
LiveIntervalUnion &LIU = LiveUnions[Unit];
for (const LiveInterval *LI : LIU.getMap()) {
Intervals.insert(LI);
- if (FuncInfo->checkFlag(LI->reg(),
- AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ if (HasSGPRHazard(LI->reg())) {
IsHazard = true;
// Break here as we only care about interval count for non-hazard regs
break;
@@ -3927,8 +3942,7 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
// V2: weight the entire order based on hazard-free usage, mean 30% reduction
// in hazards
if (Strategy == 2) {
- bool VRegIsHazard =
- FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+ bool VRegIsHazard = HasSGPRHazard(VirtReg);
SmallVector<MCPhysReg> NewOrder(Order);
std::sort(NewOrder.begin(), NewOrder.end(), [&](MCPhysReg A, MCPhysReg B) {
return VRegIsHazard ? IntervalCount[A] < IntervalCount[B]
@@ -3969,7 +3983,7 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
}
}
- if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ if (HasSGPRHazard(VirtReg)) {
// Reorder allocations based on usage, so least used will be reused first.
// This means least used regs are touched by hazards first.
std::sort(Allocated.begin(), Allocated.end(),
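The HasSGPRHazard lambda above replaces the flag persisted by the deleted pass with an on-demand, memoized query. Its caching shape, as a standalone sketch:

  // Compute a per-register property at most once; serve repeats from a map.
  llvm::DenseMap<llvm::Register, bool> Cache;
  auto Memoized = [&](llvm::Register R, auto &&Compute) -> bool {
    auto [It, Inserted] = Cache.try_emplace(R, false);
    if (Inserted)
      It->second = Compute(R);
    return It->second;
  };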
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 8eae74e0dc08a..dd2ff2e013cc8 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -350,7 +350,6 @@
; GCN-O1-NEXT: SI Whole Quad Mode
; GCN-O1-NEXT: SI optimize exec mask operations pre-RA
; GCN-O1-NEXT: AMDGPU Pre-RA Long Branch Reg
-; GCN-O1-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Machine Block Frequency Analysis
; GCN-O1-NEXT: Debug Variable Analysis
@@ -661,7 +660,6 @@
; GCN-O1-OPTS-NEXT: SI Whole Quad Mode
; GCN-O1-OPTS-NEXT: SI optimize exec mask operations pre-RA
; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA Long Branch Reg
-; GCN-O1-OPTS-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Debug Variable Analysis
@@ -978,7 +976,6 @@
; GCN-O2-NEXT: SI optimize exec mask operations pre-RA
; GCN-O2-NEXT: SI Form memory clauses
; GCN-O2-NEXT: AMDGPU Pre-RA Long Branch Reg
-; GCN-O2-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Machine Block Frequency Analysis
; GCN-O2-NEXT: Debug Variable Analysis
@@ -1308,7 +1305,6 @@
; GCN-O3-NEXT: SI optimize exec mask operations pre-RA
; GCN-O3-NEXT: SI Form memory clauses
; GCN-O3-NEXT: AMDGPU Pre-RA Long Branch Reg
-; GCN-O3-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Machine Block Frequency Analysis
; GCN-O3-NEXT: Debug Variable Analysis
>From 31d6eb98e61dc332d32a6ce54d2675ea0865ce8c Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 16 Jun 2025 13:55:14 +0900
Subject: [PATCH 3/3] - Remove register flag definition
---
llvm/lib/Target/AMDGPU/SIDefines.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index f46a73801e3c1..0f603a43fd626 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1045,7 +1045,6 @@ namespace VirtRegFlag {
enum Register_Flag : uint8_t {
// Register operand in a whole-wave mode operation.
WWM_REG = 1 << 0,
- SGPR_HAZARD_REG = 1 << 1
};
} // namespace VirtRegFlag