[llvm] [AMDGPU] Constrain AV->VReg if we do not exceed RP thresholds (PR #150086)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 24 16:59:49 PDT 2025
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/150086
>From b5f00f75c0e28b5d134b7332cbc2149debd173d9 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 18 Jul 2025 16:55:44 -0700
Subject: [PATCH 1/3] [AMDGPU] Constrain AV->VReg if we do not exceed RP thresholds
Change-Id: I17cb012504946fa9dca88b32548f922e2ce4b7a9
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 49 +++++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 13 ++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 3 +
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 +
.../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 11 +
.../AMDGPU/schedule-reconstrain-avgpr.mir | 203 ++++++++++++++++++
.../AMDGPU/long-branch-reg-all-sgpr-used.ll | 2 +
.../AMDGPU/machine-function-info-after-pei.ll | 1 +
...ine-function-info-long-branch-reg-debug.ll | 1 +
.../machine-function-info-long-branch-reg.ll | 1 +
.../AMDGPU/machine-function-info-no-ir.mir | 4 +
.../MIR/AMDGPU/machine-function-info.ll | 4 +
12 files changed, 294 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-reconstrain-avgpr.mir
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..3eb199bac95d1 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -34,6 +34,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -54,6 +55,9 @@ class GCNPreRAOptimizationsImpl {
bool processReg(Register Reg);
+ bool reconstrainRegClass(Register Reg, const TargetRegisterClass *NewRC,
+ const GCNSubtarget &ST) const;
+
public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
bool run(MachineFunction &MF);
@@ -225,6 +229,38 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
+bool GCNPreRAOptimizationsImpl::reconstrainRegClass( // Try to narrow Reg's register class to NewRC.
+ Register Reg, const TargetRegisterClass *NewRC,
+ const GCNSubtarget &ST) const { // Returns true only when Reg's class was set below.
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const TargetRegisterClass *OldRC = MRI->getRegClass(Reg);
+ const TargetRegisterClass *ConstrainRC = NewRC;
+
+ // Stop early if there is nothing to do: no target class, or Reg already has it.
+ if (!NewRC || NewRC == OldRC)
+ return false;
+
+ // Accumulate constraints from every non-debug operand (defs and uses) of Reg.
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ // Narrow ConstrainRC by this operand's constraint; a null result aborts the rewrite.
+ MachineInstr *MI = MO.getParent();
+ unsigned OpNo = &MO - &MI->getOperand(0); // Index of MO within MI's operand list.
+ ConstrainRC = MI->getRegClassConstraintEffect(OpNo, ConstrainRC, TII, TRI);
+ if (!ConstrainRC)
+ return false;
+ if (MI->isCopy()) { // For a COPY, the operand on the other side must be a VGPR (or not a register).
+ MachineOperand &OtherOp = MI->getOperand(1 - OpNo); // NOTE(review): assumes OpNo is 0 or 1; unsigned 1 - OpNo wraps otherwise - confirm.
+ if (!OtherOp.isReg())
+ continue;
+
+ if (!TRI->isVGPR(*MRI, OtherOp.getReg()))
+ return false; // Bail out without touching Reg's class.
+ }
+ }
+ MRI->setRegClass(Reg, ConstrainRC); // Commit the accumulated (possibly narrower than NewRC) class.
+ return true;
+}
+
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -245,6 +281,10 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
TII = ST.getInstrInfo();
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ bool ContrainAVGPRs =
+ ST.hasGFX90AInsts() && MFI->getMaxArchVGPRPressure() &&
+ (MFI->getMaxArchVGPRPressure() < ST.getAddressableNumArchVGPRs());
bool Changed = false;
@@ -253,6 +293,15 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
if (!LIS->hasInterval(Reg))
continue;
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+ // If we do not need to use AGPRs to assign AVRegs, it is beneficial
+ // to constrain them to VGPR as this allows for better initial assignment
+ // (based on register bitwidth).
+ if (ContrainAVGPRs && TRI->isVectorSuperClass(RC)) {
+ reconstrainRegClass(Reg, TRI->getEquivalentVGPRClass(RC), ST);
+ continue;
+ }
+
if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
(ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
continue;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index a6553083d722b..63e725b79043f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -941,10 +941,12 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
Pressure.resize(Regions.size());
RegionsWithHighRP.resize(Regions.size());
RegionsWithExcessRP.resize(Regions.size());
+ RegionsWithExcessVGPRRP.resize(Regions.size());
RegionsWithMinOcc.resize(Regions.size());
RegionsWithIGLPInstrs.resize(Regions.size());
RegionsWithHighRP.reset();
RegionsWithExcessRP.reset();
+ RegionsWithExcessVGPRRP.reset();
RegionsWithMinOcc.reset();
RegionsWithIGLPInstrs.reset();
@@ -1263,6 +1265,14 @@ void GCNSchedStage::finalizeGCNRegion() {
// reason that the original schedule is better.
checkScheduling();
+ unsigned MaxArchVGPR = 0;
+ for (auto P : DAG.Pressure) {
+ if (P.getArchVGPRNum() > MaxArchVGPR)
+ MaxArchVGPR = P.getArchVGPRNum();
+ }
+
+ MF.getInfo<SIMachineFunctionInfo>()->setMaxArchVGPRPressure(MaxArchVGPR);
+
if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
SavedMutations.swap(DAG.Mutations);
@@ -1331,6 +1341,9 @@ void GCNSchedStage::checkScheduling() {
unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+ if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+ DAG.RegionsWithExcessVGPRRP[RegionIdx] = true;
+
if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
PressureAfter.getAGPRNum() > MaxArchVGPRs ||
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 94cd795bbc8f6..18ce3409cf3fd 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -250,6 +250,9 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// limit. Register pressure in these regions usually will result in spilling.
BitVector RegionsWithExcessRP;
+ // Regions that have VGPR RP which exceed the addressable limit.
+ BitVector RegionsWithExcessVGPRRP;
+
// Regions that has the same occupancy as the latest MinOccupancy
BitVector RegionsWithMinOcc;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 75ce67c00228d..60d90276af043 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -713,6 +713,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
Occupancy(MFI.getOccupancy()),
+ MaxArchVGPRPressure(MFI.getMaxArchVGPRPressure()),
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
@@ -760,6 +761,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
Occupancy = YamlMFI.Occupancy;
+ MaxArchVGPRPressure = YamlMFI.MaxArchVGPRPressure;
IsEntryFunction = YamlMFI.IsEntryFunction;
NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
MemoryBound = YamlMFI.MemoryBound;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 274a60adb8d07..ed10a994ad75c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -274,6 +274,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
// TODO: 10 may be a better default since it's the maximum.
unsigned Occupancy = 0;
+ unsigned MaxArchVGPRPressure = 0;
SmallVector<StringValue, 2> SpillPhysVGPRS;
SmallVector<StringValue> WWMReservedRegs;
@@ -343,6 +344,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("highBitsOf32BitAddress",
MFI.HighBitsOf32BitAddress, 0u);
YamlIO.mapOptional("occupancy", MFI.Occupancy, 0);
+ YamlIO.mapOptional("maxArchVGPRPressure", MFI.MaxArchVGPRPressure, 0u);
YamlIO.mapOptional("spillPhysVGPRs", MFI.SpillPhysVGPRS);
YamlIO.mapOptional("wwmReservedRegs", MFI.WWMReservedRegs);
YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI);
@@ -512,6 +514,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// Current recorded maximum possible occupancy.
unsigned Occupancy;
+ // The max arch VGPR pressure found during scheduling.
+ unsigned MaxArchVGPRPressure;
+
// Maximum number of dwords that can be clusterred during instruction
// scheduler stage.
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
@@ -1176,6 +1181,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
return MayNeedAGPRs;
}
+ unsigned getMaxArchVGPRPressure() const { return MaxArchVGPRPressure; } // Reads the stored max ArchVGPR pressure.
+
+ void setMaxArchVGPRPressure(unsigned NewArchVGPRPressure) { // Overwrites the stored value unconditionally (no max with the old one).
+ MaxArchVGPRPressure = NewArchVGPRPressure;
+ }
+
// \returns true if a function has a use of AGPRs via inline asm or
// has a call which may use it.
bool mayUseAGPRs(const Function &F) const;
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-reconstrain-avgpr.mir b/llvm/test/CodeGen/AMDGPU/schedule-reconstrain-avgpr.mir
new file mode 100644
index 0000000000000..6abfb49c763dd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-reconstrain-avgpr.mir
@@ -0,0 +1,203 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -run-pass=amdgpu-pre-ra-optimizations -o - %s | FileCheck -check-prefix=UNIFIED %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=amdgpu-pre-ra-optimizations -o - %s | FileCheck -check-prefix=SPLIT %s
+
+---
+name: reconstrain
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
+ frameOffsetReg: '$sgpr32'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr33' }
+ maxArchVGPRPressure: 2
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; UNIFIED-LABEL: name: reconstrain
+ ; UNIFIED: liveins: $vgpr0, $vgpr1
+ ; UNIFIED-NEXT: {{ $}}
+ ; UNIFIED-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
+ ; UNIFIED-NEXT: S_ENDPGM 0
+ ;
+ ; SPLIT-LABEL: name: reconstrain
+ ; SPLIT: liveins: $vgpr0, $vgpr1
+ ; SPLIT-NEXT: {{ $}}
+ ; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
+ ; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
+ ; SPLIT-NEXT: S_ENDPGM 0
+ %0:av_64_align2 = IMPLICIT_DEF
+ S_NOP 0, implicit %0
+ S_ENDPGM 0
+...
+
+---
+name: unspecified_yaml
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
+ frameOffsetReg: '$sgpr32'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr33' }
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; UNIFIED-LABEL: name: unspecified_yaml
+ ; UNIFIED: liveins: $vgpr0, $vgpr1
+ ; UNIFIED-NEXT: {{ $}}
+ ; UNIFIED-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
+ ; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
+ ; UNIFIED-NEXT: S_ENDPGM 0
+ ;
+ ; SPLIT-LABEL: name: unspecified_yaml
+ ; SPLIT: liveins: $vgpr0, $vgpr1
+ ; SPLIT-NEXT: {{ $}}
+ ; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
+ ; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
+ ; SPLIT-NEXT: S_ENDPGM 0
+ %0:av_64_align2 = IMPLICIT_DEF
+ S_NOP 0, implicit %0
+ S_ENDPGM 0
+...
+
+---
+name: constrain_highrp
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
+ frameOffsetReg: '$sgpr32'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr33' }
+ maxArchVGPRPressure: 255
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; UNIFIED-LABEL: name: constrain_highrp
+ ; UNIFIED: liveins: $vgpr0, $vgpr1
+ ; UNIFIED-NEXT: {{ $}}
+ ; UNIFIED-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
+ ; UNIFIED-NEXT: S_ENDPGM 0
+ ;
+ ; SPLIT-LABEL: name: constrain_highrp
+ ; SPLIT: liveins: $vgpr0, $vgpr1
+ ; SPLIT-NEXT: {{ $}}
+ ; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
+ ; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
+ ; SPLIT-NEXT: S_ENDPGM 0
+ %0:av_64_align2 = IMPLICIT_DEF
+ S_NOP 0, implicit %0
+ S_ENDPGM 0
+...
+
+---
+name: no_constrain_highrp
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
+ frameOffsetReg: '$sgpr32'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr33' }
+ maxArchVGPRPressure: 256
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; UNIFIED-LABEL: name: no_constrain_highrp
+ ; UNIFIED: liveins: $vgpr0, $vgpr1
+ ; UNIFIED-NEXT: {{ $}}
+ ; UNIFIED-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
+ ; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
+ ; UNIFIED-NEXT: S_ENDPGM 0
+ ;
+ ; SPLIT-LABEL: name: no_constrain_highrp
+ ; SPLIT: liveins: $vgpr0, $vgpr1
+ ; SPLIT-NEXT: {{ $}}
+ ; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
+ ; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
+ ; SPLIT-NEXT: S_ENDPGM 0
+ %0:av_64_align2 = IMPLICIT_DEF
+ S_NOP 0, implicit %0
+ S_ENDPGM 0
+...
+
+---
+name: no_constrain_highrp1
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
+ frameOffsetReg: '$sgpr32'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr33' }
+ maxArchVGPRPressure: 257
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; UNIFIED-LABEL: name: no_constrain_highrp1
+ ; UNIFIED: liveins: $vgpr0, $vgpr1
+ ; UNIFIED-NEXT: {{ $}}
+ ; UNIFIED-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
+ ; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
+ ; UNIFIED-NEXT: S_ENDPGM 0
+ ;
+ ; SPLIT-LABEL: name: no_constrain_highrp1
+ ; SPLIT: liveins: $vgpr0, $vgpr1
+ ; SPLIT-NEXT: {{ $}}
+ ; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
+ ; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
+ ; SPLIT-NEXT: S_ENDPGM 0
+ %0:av_64_align2 = IMPLICIT_DEF
+ S_NOP 0, implicit %0
+ S_ENDPGM 0
+...
+
+---
+name: no_constrain_use
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27'
+ frameOffsetReg: '$sgpr32'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr33' }
+ maxArchVGPRPressure: 0
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; UNIFIED-LABEL: name: no_constrain_use
+ ; UNIFIED: liveins: $vgpr0, $vgpr1
+ ; UNIFIED-NEXT: {{ $}}
+ ; UNIFIED-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
+ ; UNIFIED-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[DEF]]
+ ; UNIFIED-NEXT: S_NOP 0, implicit [[DEF]]
+ ; UNIFIED-NEXT: S_ENDPGM 0
+ ;
+ ; SPLIT-LABEL: name: no_constrain_use
+ ; SPLIT: liveins: $vgpr0, $vgpr1
+ ; SPLIT-NEXT: {{ $}}
+ ; SPLIT-NEXT: [[DEF:%[0-9]+]]:av_64_align2 = IMPLICIT_DEF
+ ; SPLIT-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[DEF]]
+ ; SPLIT-NEXT: S_NOP 0, implicit [[DEF]]
+ ; SPLIT-NEXT: S_ENDPGM 0
+ %0:av_64_align2 = IMPLICIT_DEF
+ INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, %0
+ S_NOP 0, implicit %0
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index b514c49394d21..b82bc3fb8724d 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -39,6 +39,7 @@
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 5
+; CHECK-NEXT: maxArchVGPRPressure: 0
; CHECK-NEXT: scavengeFI: '%stack.0'
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
@@ -308,6 +309,7 @@
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 5
+; CHECK-NEXT: maxArchVGPRPressure: 0
; CHECK-NEXT: scavengeFI: '%stack.0'
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index fc730f9e88454..9c38f9ef0315e 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -39,6 +39,7 @@
; AFTER-PEI-NEXT: fp64-fp16-output-denormals: true
; AFTER-PEI-NEXT: highBitsOf32BitAddress: 0
; AFTER-PEI-NEXT: occupancy: 5
+; AFTER-PEI-NEXT: maxArchVGPRPressure: 0
; AFTER-PEI-NEXT: scavengeFI: '%stack.3'
; AFTER-PEI-NEXT: vgprForAGPRCopy: ''
; AFTER-PEI-NEXT: sgprForEXECCopy: ''
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 5adef1433079d..cfb44e3c11171 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -40,6 +40,7 @@
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: BitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
+; CHECK-NEXT: maxArchVGPRPressure: 3
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index fa40164aa02f0..343d4ed402296 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -40,6 +40,7 @@
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: BitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
+; CHECK-NEXT: maxArchVGPRPressure: 3
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 24565e4423d04..b49e203a65c4a 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -49,6 +49,7 @@
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
# FULL-NEXT: occupancy: 8
+# FULL-NEXT: maxArchVGPRPressure: 0
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
@@ -156,6 +157,7 @@ body: |
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
# FULL-NEXT: occupancy: 10
+# FULL-NEXT: maxArchVGPRPressure: 0
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
@@ -234,6 +236,7 @@ body: |
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
# FULL-NEXT: occupancy: 10
+# FULL-NEXT: maxArchVGPRPressure: 0
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
@@ -313,6 +316,7 @@ body: |
# FULL-NEXT: fp64-fp16-output-denormals: true
# FULL-NEXT: highBitsOf32BitAddress: 0
# FULL-NEXT: occupancy: 10
+# FULL-NEXT: maxArchVGPRPressure: 0
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index a15271382f37d..c56dfeeab4ac6 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -50,6 +50,7 @@
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 8
+; CHECK-NEXT: maxArchVGPRPressure: 2
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
@@ -99,6 +100,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
+; CHECK-NEXT: maxArchVGPRPressure: 2
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
@@ -172,6 +174,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
+; CHECK-NEXT: maxArchVGPRPressure: 0
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
@@ -227,6 +230,7 @@ define void @function() {
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
+; CHECK-NEXT: maxArchVGPRPressure: 0
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
>From 3677b7015f5e0b950285203cddaf3f4e09a45428 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 24 Jul 2025 16:52:27 -0700
Subject: [PATCH 2/3] Delete stale code
Change-Id: I1ebeebd405e773e494ad7242f9a0017f1b5f6013
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 5 -----
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 3 ---
2 files changed, 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 63e725b79043f..8bc24c991a321 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -941,12 +941,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
Pressure.resize(Regions.size());
RegionsWithHighRP.resize(Regions.size());
RegionsWithExcessRP.resize(Regions.size());
- RegionsWithExcessVGPRRP.resize(Regions.size());
RegionsWithMinOcc.resize(Regions.size());
RegionsWithIGLPInstrs.resize(Regions.size());
RegionsWithHighRP.reset();
RegionsWithExcessRP.reset();
- RegionsWithExcessVGPRRP.reset();
RegionsWithMinOcc.reset();
RegionsWithIGLPInstrs.reset();
@@ -1341,9 +1339,6 @@ void GCNSchedStage::checkScheduling() {
unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
- if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
- DAG.RegionsWithExcessVGPRRP[RegionIdx] = true;
-
if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
PressureAfter.getAGPRNum() > MaxArchVGPRs ||
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 18ce3409cf3fd..94cd795bbc8f6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -250,9 +250,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// limit. Register pressure in these regions usually will result in spilling.
BitVector RegionsWithExcessRP;
- // Regions that have VGPR RP which exceed the addressable limit.
- BitVector RegionsWithExcessVGPRRP;
-
// Regions that has the same occupancy as the latest MinOccupancy
BitVector RegionsWithMinOcc;
>From a4875189202eef66d42e62b2cd5913499ba66d12 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 24 Jul 2025 16:57:24 -0700
Subject: [PATCH 3/3] Move the max RP calculation to end of stage
Change-Id: If9378e0c3682ef62768756bea8a0315fdd203594
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 8bc24c991a321..4825b549b6796 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1130,6 +1130,15 @@ bool PreRARematStage::initGCNSchedStage() {
}
void GCNSchedStage::finalizeGCNSchedStage() {
+ unsigned MaxArchVGPR = 0;
+ for (auto P : DAG.Pressure) {
+ if (P.getArchVGPRNum() > MaxArchVGPR)
+ MaxArchVGPR = P.getArchVGPRNum();
+ }
+
+ MF.getInfo<SIMachineFunctionInfo>()->setMaxArchVGPRPressure(MaxArchVGPR);
+
+
DAG.finishBlock();
LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}
@@ -1263,14 +1272,6 @@ void GCNSchedStage::finalizeGCNRegion() {
// reason that the original schedule is better.
checkScheduling();
- unsigned MaxArchVGPR = 0;
- for (auto P : DAG.Pressure) {
- if (P.getArchVGPRNum() > MaxArchVGPR)
- MaxArchVGPR = P.getArchVGPRNum();
- }
-
- MF.getInfo<SIMachineFunctionInfo>()->setMaxArchVGPRPressure(MaxArchVGPR);
-
if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
SavedMutations.swap(DAG.Mutations);
More information about the llvm-commits
mailing list