[llvm] a5cbd2a - Revert "[AMDGPU] Skip register uses in AMDGPUResourceUsageAnalysis (#… (#144039)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 13 03:48:27 PDT 2025
Author: Diana Picus
Date: 2025-06-13T12:48:24+02:00
New Revision: a5cbd2ab0bebc722f836cd3b04dbab691ef9ed2f
URL: https://github.com/llvm/llvm-project/commit/a5cbd2ab0bebc722f836cd3b04dbab691ef9ed2f
DIFF: https://github.com/llvm/llvm-project/commit/a5cbd2ab0bebc722f836cd3b04dbab691ef9ed2f.diff
LOG: Revert "[AMDGPU] Skip register uses in AMDGPUResourceUsageAnalysis (#… (#144039)
…133242)"
This reverts commit 130080fab11cde5efcb338b77f5c3b31097df6e6 because it
causes issues in testcases similar to coalescer_remat.ll [1], i.e. when
we use a VGPR tuple but only write to its lower parts. The high VGPRs
would then not be included in the vgpr_count, and accessing them would
be an out of bounds violation.
[1]
https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
Added:
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/lib/Target/AMDGPU/SIRegisterInfo.h
llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
llvm/test/CodeGen/AMDGPU/code-object-v3.ll
llvm/test/CodeGen/AMDGPU/elf-notes.ll
llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
llvm/test/CodeGen/AMDGPU/hsa.ll
llvm/test/CodeGen/AMDGPU/ipra.ll
llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
llvm/test/CodeGen/AMDGPU/register-count-comments.ll
llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
Removed:
llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 3aa8773fa506b..e0a43225e81be 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -4263,9 +4263,10 @@ same *vendor-name*.
wavefront for
GFX6-GFX9. A register
is required if it is
- written to, or
+ used explicitly, or
if a higher numbered
- register is written to. This
+ register is used
+ explicitly. This
includes the special
SGPRs for VCC, Flat
Scratch (GFX7-GFX9)
@@ -4283,10 +4284,10 @@ same *vendor-name*.
each work-item for
GFX6-GFX9. A register
is required if it is
- written to, or
+ used explicitly, or
if a higher numbered
- register is
- written to.
+ register is used
+ explicitly.
".agpr_count" integer Required Number of accumulator
registers required by
each work-item for
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d4fea30f21f45..491314daf2d81 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -989,7 +989,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// dispatch registers are function args.
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
- if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
+ if (isShader(F.getCallingConv())) {
bool IsPixelShader =
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
@@ -1060,6 +1060,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
+ } else if (isKernel(F.getCallingConv()) &&
+ MFI->getNumKernargPreloadedSGPRs()) {
+ // Consider cases where the total number of UserSGPRs with trailing
+ // allocated preload SGPRs, is greater than the number of explicitly
+ // referenced SGPRs.
+ const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
+ CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
+ ProgInfo.NumSGPR =
+ AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 7bde59412d905..9a609a1752de0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -137,29 +137,274 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
if (MFI->isStackRealigned())
Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
- Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);
-
- Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
- Info.NumExplicitSGPR =
- TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
- if (ST.hasMAIInsts())
- Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
-
- // Preloaded registers are written by the hardware, not defined in the
- // function body, so they need special handling.
- if (MFI->isEntryFunction()) {
- Info.NumExplicitSGPR =
- std::max<int32_t>(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs());
- Info.NumVGPR = std::max<int32_t>(Info.NumVGPR, MFI->getNumPreloadedVGPRs());
- }
-
- if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
+ Info.UsesVCC =
+ MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+ // If there are no calls, MachineRegisterInfo can tell us the used register
+ // count easily.
+ // A tail call isn't considered a call for MachineFrameInfo's purposes.
+ if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+ Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
+ Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
+ if (ST.hasMAIInsts())
+ Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
return Info;
+ }
+ int32_t MaxVGPR = -1;
+ int32_t MaxAGPR = -1;
+ int32_t MaxSGPR = -1;
Info.CalleeSegmentSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
+ // TODO: Check regmasks? Do they occur anywhere except calls?
+ for (const MachineOperand &MO : MI.operands()) {
+ unsigned Width = 0;
+ bool IsSGPR = false;
+ bool IsAGPR = false;
+
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::SCC:
+ case AMDGPU::M0:
+ case AMDGPU::M0_LO16:
+ case AMDGPU::M0_HI16:
+ case AMDGPU::SRC_SHARED_BASE_LO:
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT_LO:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE_LO:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT_LO:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::SGPR_NULL64:
+ case AMDGPU::MODE:
+ continue;
+
+ case AMDGPU::NoRegister:
+ assert(MI.isDebugInstr() &&
+ "Instruction uses invalid noreg register");
+ continue;
+
+ case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
+ case AMDGPU::VCC_LO_LO16:
+ case AMDGPU::VCC_LO_HI16:
+ case AMDGPU::VCC_HI_LO16:
+ case AMDGPU::VCC_HI_HI16:
+ Info.UsesVCC = true;
+ continue;
+
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ continue;
+
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ llvm_unreachable("xnack_mask registers should not be used");
+
+ case AMDGPU::LDS_DIRECT:
+ llvm_unreachable("lds_direct register should not be used");
+
+ case AMDGPU::TBA:
+ case AMDGPU::TBA_LO:
+ case AMDGPU::TBA_HI:
+ case AMDGPU::TMA:
+ case AMDGPU::TMA_LO:
+ case AMDGPU::TMA_HI:
+ llvm_unreachable("trap handler registers should not be used");
+
+ case AMDGPU::SRC_VCCZ:
+ llvm_unreachable("src_vccz register should not be used");
+
+ case AMDGPU::SRC_EXECZ:
+ llvm_unreachable("src_execz register should not be used");
+
+ case AMDGPU::SRC_SCC:
+ llvm_unreachable("src_scc register should not be used");
+
+ default:
+ break;
+ }
+
+ if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
+ AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
+ AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 1;
+ } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+ AMDGPU::VGPR_16RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 1;
+ } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 1;
+ } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 2;
+ } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 3;
+ } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 3;
+ } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 3;
+ } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 4;
+ } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 5;
+ } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 5;
+ } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 5;
+ } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 6;
+ } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 6;
+ } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 6;
+ } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 7;
+ } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 7;
+ } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 7;
+ } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 8;
+ } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 8;
+ } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 8;
+ } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 9;
+ } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 9;
+ } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 9;
+ } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 10;
+ } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 10;
+ } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 10;
+ } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 11;
+ } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 11;
+ } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 11;
+ } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 12;
+ } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 12;
+ } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 12;
+ } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 16;
+ } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 16;
+ } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 16;
+ } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 32;
+ } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 32;
+ } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 32;
+ } else {
+ // We only expect TTMP registers or registers that do not belong to
+ // any RC.
+ assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
+ AMDGPU::TTMP_64RegClass.contains(Reg) ||
+ AMDGPU::TTMP_128RegClass.contains(Reg) ||
+ AMDGPU::TTMP_256RegClass.contains(Reg) ||
+ AMDGPU::TTMP_512RegClass.contains(Reg) ||
+ !TRI.getPhysRegBaseClass(Reg)) &&
+ "Unknown register class");
+ }
+ unsigned HWReg = TRI.getHWRegIndex(Reg);
+ int MaxUsed = HWReg + Width - 1;
+ if (IsSGPR) {
+ MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+ } else if (IsAGPR) {
+ MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
+ } else {
+ MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+ }
+ }
+
if (MI.isCall()) {
// Pseudo used just to encode the underlying global. Is there a better
// way to track this?
@@ -219,5 +464,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
}
}
+ Info.NumExplicitSGPR = MaxSGPR + 1;
+ Info.NumVGPR = MaxVGPR + 1;
+ Info.NumAGPR = MaxAGPR + 1;
+
return Info;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 01718faaf5c2e..0e7635a045588 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -970,25 +970,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
return NumUserSGPRs;
}
- // Get the number of preloaded SGPRs for compute kernels.
unsigned getNumPreloadedSGPRs() const {
return NumUserSGPRs + NumSystemSGPRs;
}
- // Get the number of preloaded VGPRs for compute kernels.
- unsigned getNumPreloadedVGPRs() const {
- if (hasWorkItemIDZ())
- return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1;
-
- if (hasWorkItemIDY())
- return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1;
-
- if (hasWorkItemIDX())
- return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1;
-
- return 0;
- }
-
unsigned getNumKernargPreloadedSGPRs() const {
return UserSGPRInfo.getNumKernargPreloadSGPRs();
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index b76823a128e07..e41189adfb46f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4055,20 +4055,6 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
return 0;
}
-unsigned
-SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
- const TargetRegisterClass &RC) const {
- for (MCPhysReg Reg : reverse(RC.getRegisters())) {
- for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) {
- if (llvm::any_of(MRI.def_instructions(*AI), [](const MachineInstr &MI) {
- return !MI.isImplicitDef();
- }))
- return getHWRegIndex(Reg) + 1;
- }
- }
- return 0;
-}
-
SmallVector<StringLiteral>
SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 7726762ad0e6d..a4b135d5e0b59 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -486,11 +486,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const;
- // \returns the number of registers of a given \p RC defined in a function.
- // Does not go inside function calls.
- unsigned getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
- const TargetRegisterClass &RC) const;
-
std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
: std::optional<uint8_t>{};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index bdd86c1af6248..9b35920f8547a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 24
+; GPRIDX-NEXT: wavefront_sgpr_count = 17
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -3245,7 +3245,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = 18
+; GFX10-NEXT: wavefront_sgpr_count = 10
; GFX10-NEXT: workitem_vgpr_count = 3
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
@@ -3294,7 +3294,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
-; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -3337,7 +3337,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = 16
+; GFX11-NEXT: wavefront_sgpr_count = 7
; GFX11-NEXT: workitem_vgpr_count = 3
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
@@ -4034,7 +4034,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -4077,8 +4077,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 24
-; GPRIDX-NEXT: workitem_vgpr_count = 3
+; GPRIDX-NEXT: wavefront_sgpr_count = 16
+; GPRIDX-NEXT: workitem_vgpr_count = 2
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4249,8 +4249,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = 18
-; GFX10-NEXT: workitem_vgpr_count = 3
+; GFX10-NEXT: wavefront_sgpr_count = 10
+; GFX10-NEXT: workitem_vgpr_count = 2
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -4291,7 +4291,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
-; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -4334,7 +4334,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = 16
+; GFX11-NEXT: wavefront_sgpr_count = 6
; GFX11-NEXT: workitem_vgpr_count = 2
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
@@ -4382,7 +4382,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -4425,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 24
+; GPRIDX-NEXT: wavefront_sgpr_count = 16
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4603,7 +4603,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = 18
+; GFX10-NEXT: wavefront_sgpr_count = 10
; GFX10-NEXT: workitem_vgpr_count = 3
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
@@ -4648,7 +4648,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
-; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -4691,7 +4691,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = 16
+; GFX11-NEXT: wavefront_sgpr_count = 7
; GFX11-NEXT: workitem_vgpr_count = 3
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
index cc614bb24839c..7bf9a29e9ff44 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
@@ -13,9 +13,8 @@
; CHECK: {{^}}kernel_illegal_agpr_use_asm:
; CHECK: ; use a0
-; GFX908: NumVgprs: 3
-; GFX90A: NumVgprs: 1
-; CHECK: NumAgprs: 0
+; CHECK: NumVgprs: 0
+; CHECK: NumAgprs: 1
define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
call void asm sideeffect "; use $0", "a"(i32 poison)
ret void
@@ -25,7 +24,7 @@ define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
; CHECK: ; use a0
; CHECK: NumVgprs: 0
-; CHECK: NumAgprs: 0
+; CHECK: NumAgprs: 1
define void @func_illegal_agpr_use_asm() #0 {
call void asm sideeffect "; use $0", "a"(i32 poison)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index 7851de641c5a3..dd760c2a215ca 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -10,9 +10,9 @@
; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
; ASM: .amdhsa_user_sgpr_count 12
-; ASM: .amdhsa_next_free_sgpr 15
-; ASM: ; TotalNumSgprs: 21
-; ASM: ; NumSGPRsForWavesPerEU: 21
+; ASM: .amdhsa_next_free_sgpr 12
+; ASM: ; TotalNumSgprs: 18
+; ASM: ; NumSGPRsForWavesPerEU: 18
; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
; feild that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -26,13 +26,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret
; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 0070 8000af00 94000000 08000800 00000000 ................
+; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000 @...............
; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
; ASM: .amdhsa_user_sgpr_count 10
-; ASM: .amdhsa_next_free_sgpr 11
-; ASM: ; TotalNumSgprs: 17
-; ASM: ; NumSGPRsForWavesPerEU: 17
+; ASM: .amdhsa_next_free_sgpr 10
+; ASM: ; TotalNumSgprs: 16
+; ASM: ; NumSGPRsForWavesPerEU: 16
; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
; implicit, and 6 extra.
@@ -46,9 +46,9 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 {
; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
; ASM: .amdhsa_user_sgpr_count 3
-; ASM: .amdhsa_next_free_sgpr 4
-; ASM: ; TotalNumSgprs: 10
-; ASM: ; NumSGPRsForWavesPerEU: 10
+; ASM: .amdhsa_next_free_sgpr 3
+; ASM: ; TotalNumSgprs: 9
+; ASM: ; NumSGPRsForWavesPerEU: 9
; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
@@ -57,13 +57,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r
; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 00f0 4000af00 84000000 08000000 00000000 @...............
+; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
; ASM: .amdhsa_user_sgpr_count 2
-; ASM: .amdhsa_next_free_sgpr 3
-; ASM: ; TotalNumSgprs: 9
-; ASM: ; NumSGPRsForWavesPerEU: 9
+; ASM: .amdhsa_next_free_sgpr 0
+; ASM: ; TotalNumSgprs: 6
+; ASM: ; NumSGPRsForWavesPerEU: 6
; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
; Encoded like '00'.
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 494ade73cb5f8..f4d17e50cf18c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,8 +142,8 @@ attributes #0 = { nounwind }
; GCN: amdpal.pipelines:
; GCN-NEXT: - .registers:
-; GFX8-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf010a{{$}}
-; GFX9-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf014a{{$}}
+; SDAG-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GISEL-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
; GCN-NEXT: .shader_functions:
; GCN-NEXT: dynamic_stack:
@@ -164,13 +164,13 @@ attributes #0 = { nounwind }
; GCN-NEXT: multiple_stack:
; GCN-NEXT: .backend_stack_size: 0x24{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
-; GCN-NEXT: .sgpr_count: 0x1{{$}}
+; GCN-NEXT: .sgpr_count: 0x21{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x24{{$}}
; GCN-NEXT: .vgpr_count: 0x3{{$}}
; GCN-NEXT: no_stack:
; GCN-NEXT: .backend_stack_size: 0{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
-; GCN-NEXT: .sgpr_count: 0x1{{$}}
+; GCN-NEXT: .sgpr_count: 0x20{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
; GCN-NEXT: .vgpr_count: 0x1{{$}}
; GCN-NEXT: no_stack_call:
@@ -203,7 +203,7 @@ attributes #0 = { nounwind }
; GCN-NEXT: simple_lds:
; GCN-NEXT: .backend_stack_size: 0{{$}}
; GCN-NEXT: .lds_size: 0x100{{$}}
-; GCN-NEXT: .sgpr_count: 0x1{{$}}
+; GCN-NEXT: .sgpr_count: 0x20{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
; GCN-NEXT: .vgpr_count: 0x1{{$}}
; GCN-NEXT: simple_lds_recurse:
@@ -215,7 +215,7 @@ attributes #0 = { nounwind }
; GCN-NEXT: simple_stack:
; GCN-NEXT: .backend_stack_size: 0x14{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
-; GCN-NEXT: .sgpr_count: 0x1{{$}}
+; GCN-NEXT: .sgpr_count: 0x21{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x14{{$}}
; GCN-NEXT: .vgpr_count: 0x2{{$}}
; GCN-NEXT: simple_stack_call:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index 5ccf41c408b72..f52ba7000edeb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX11W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX11W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
; ELF: Section {
; ELF: Name: .text
@@ -23,16 +23,8 @@
; ELF: Section: .text (0x2)
; ELF: }
-; GFX10: NumSGPRsForWavesPerEU: 12
-; GFX10: NumVGPRsForWavesPerEU: 3
-
-; Wave32 and 64 behave
diff erently due to the UserSGPRInit16Bug,
-; which only affects Wave32.
-; GFX11W32: NumSGPRsForWavesPerEU: 16
-; GFX11W32: NumVGPRsForWavesPerEU: 1
-
-; GFX11W64: NumSGPRsForWavesPerEU: 11
-; GFX11W64: NumVGPRsForWavesPerEU: 1
+; GFX10: NumSGPRsForWavesPerEU: 6
+; GFX10: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @simple(ptr addrspace(1) %out) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 0e0a81d4657df..616867481d177 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -2,10 +2,10 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s
; CHECK-LABEL: {{^}}min_64_max_64:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @min_64_max_64() #0 {
entry:
ret void
@@ -13,10 +13,10 @@ entry:
attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
; CHECK-LABEL: {{^}}min_64_max_128:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @min_64_max_128() #1 {
entry:
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 5617a80fc94b4..e9fe4f3c618c7 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -26,10 +26,10 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
; Exactly 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @empty_exactly_10() #2 {
entry:
ret void
@@ -38,10 +38,10 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
; At least 1 wave per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_1:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @empty_at_least_1() #3 {
entry:
ret void
@@ -50,10 +50,10 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}
; At least 5 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_5:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @empty_at_least_5() #4 {
entry:
ret void
@@ -62,10 +62,10 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}
; At least 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @empty_at_least_10() #5 {
entry:
ret void
@@ -88,10 +88,10 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64
; At most 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_most_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @empty_at_most_10() #7 {
entry:
ret void
@@ -102,10 +102,10 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
; Between 5 and 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_between_5_and_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @empty_between_5_and_10() #8 {
entry:
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
index efa416e301ccc..2e79d8bab46a6 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -28,7 +28,7 @@ bb:
}
; ALL: .set .Laliasee_default.num_vgpr, 0
; ALL-NEXT: .set .Laliasee_default.num_agpr, 27
-; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 0
+; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 32
attributes #0 = { noinline norecurse nounwind optnone }
attributes #1 = { noinline norecurse nounwind readnone willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
index 62ca985bc6400..337da5d0ecbe0 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -18,7 +18,7 @@ bb:
; CHECK: .set .Laliasee_default_vgpr64_sgpr102.num_vgpr, 53
; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 32
define internal void @aliasee_default_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
index 344f8200608f6..075eddd2763d3 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -24,7 +24,7 @@ bb:
; CHECK: .set .Laliasee_vgpr32_sgpr76.num_vgpr, 27
; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 32
define internal void @aliasee_vgpr32_sgpr76() #1 {
bb:
call void asm sideeffect "; clobber v26 ", "~{v26}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
index 3d36f8a514c47..4fd181d3c0f43 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -21,7 +21,7 @@ bb:
; CHECK: .set .Laliasee_vgpr64_sgpr102.num_vgpr, 53
; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 32
define internal void @aliasee_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
index 2274c437daf62..00f72d5d8b1dd 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -21,7 +21,7 @@ bb:
; CHECK: .set .Laliasee_vgpr256_sgpr102.num_vgpr, 253
; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 33
define internal void @aliasee_vgpr256_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v252 ", "~{v252}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index db1269e8e95c2..dbd00f09943c0 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -7,7 +7,7 @@
; Make sure to run a GPU with the SGPR allocation bug.
; GCN-LABEL: {{^}}use_vcc:
-; GCN: ; TotalNumSgprs: 2
+; GCN: ; TotalNumSgprs: 34
; GCN: ; NumVgprs: 0
define void @use_vcc() #1 {
call void asm sideeffect "", "~{vcc}" () #0
@@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
}
; GCN-LABEL: {{^}}use_flat_scratch:
-; CI: ; TotalNumSgprs: 4
-; VI: ; TotalNumSgprs: 6
+; CI: ; TotalNumSgprs: 36
+; VI: ; TotalNumSgprs: 38
; GCN: ; NumVgprs: 0
define void @use_flat_scratch() #1 {
call void asm sideeffect "", "~{flat_scratch}" () #0
@@ -234,7 +234,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; Make sure there's no assert when a sgpr96 is used.
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
; VI-BUG: TotalNumSgprs: 96
@@ -249,7 +249,7 @@ entry:
; Make sure there's no assert when a sgpr160 is used.
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
; VI-BUG: TotalNumSgprs: 96
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
index 55dc394628176..61830f18ad7a7 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; It's probably OK if this is slightly higher:
-; CHECK: ; NumVgprs: 5
+; CHECK: ; NumVgprs: 8
define amdgpu_kernel void @foobar(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %flag) {
entry:
%cmpflag = icmp eq i32 %flag, 1
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index d8d7494d0dc1c..3fe3cafd729a7 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -16,7 +16,7 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -35,7 +35,7 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -93,7 +93,7 @@ entry:
; registers used.
;
; ALL-ASM-LABEL: {{^}}empty:
-; ALL-ASM: .amdhsa_next_free_vgpr 3
+; ALL-ASM: .amdhsa_next_free_vgpr 1
; ALL-ASM: .amdhsa_next_free_sgpr 1
define amdgpu_kernel void @empty(
i32 %i,
diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
index 59cf9825116fa..22d699a8f4809 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
@@ -43,7 +43,7 @@
; OSABI-HSA-ELF: .sgpr_count: 96
; OSABI-HSA-ELF: .sgpr_spill_count: 0
; OSABI-HSA-ELF: .symbol: elf_notes.kd
-; OSABI-HSA-ELF: .vgpr_count: 1
+; OSABI-HSA-ELF: .vgpr_count: 0
; OSABI-HSA-ELF: .vgpr_spill_count: 0
; OSABI-HSA-ELF: .wavefront_size: 64
; OSABI-HSA-ELF: amdhsa.target: amdgcn-amd-amdhsa--gfx802
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index ed1f3e1397abc..a59382ba20dc5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -27,15 +27,15 @@
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: ; TotalNumSgprs: 12
-; VI-NOXNACK: ; TotalNumSgprs: 12
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 18
-; VI-XNACK: ; TotalNumSgprs: 16
-; HSA-VI-XNACK: ; TotalNumSgprs: 22
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; CI: ; TotalNumSgprs: 8
+; VI-NOXNACK: ; TotalNumSgprs: 8
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 8
+; VI-XNACK: ; TotalNumSgprs: 12
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
define amdgpu_kernel void @no_vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7}"()
@@ -50,15 +50,15 @@ entry:
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: ; TotalNumSgprs: 14
-; VI-NOXNACK: ; TotalNumSgprs: 14
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 20
-; VI-XNACK: ; TotalNumSgprs: 16
-; HSA-VI-XNACK: ; TotalNumSgprs: 22
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
+; CI: ; TotalNumSgprs: 10
+; VI-NOXNACK: ; TotalNumSgprs: 10
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 10
+; VI-XNACK: ; TotalNumSgprs: 12
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
define amdgpu_kernel void @vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -73,15 +73,15 @@ entry:
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: ; TotalNumSgprs: 16
-; VI-NOXNACK: ; TotalNumSgprs: 18
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 18
+; VI-XNACK: ; TotalNumSgprs: 14
; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
define amdgpu_kernel void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -96,15 +96,15 @@ entry:
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: ; TotalNumSgprs: 16
-; VI-NOXNACK: ; TotalNumSgprs: 18
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 18
+; VI-XNACK: ; TotalNumSgprs: 14
; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
define amdgpu_kernel void @vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -122,15 +122,15 @@ entry:
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -143,15 +143,15 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr_lo() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -166,15 +166,15 @@ entry:
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr_hi() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index 2a18d40e0bd8a..59bcc5d8be9b1 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -5,14 +5,14 @@
; GCN-LABEL: {{^}}use_vcc:
; GCN: .set use_vcc.num_vgpr, 0
; GCN: .set use_vcc.num_agpr, 0
-; GCN: .set use_vcc.numbered_sgpr, 0
+; GCN: .set use_vcc.numbered_sgpr, 32
; GCN: .set use_vcc.private_seg_size, 0
; GCN: .set use_vcc.uses_vcc, 1
; GCN: .set use_vcc.uses_flat_scratch, 0
; GCN: .set use_vcc.has_dyn_sized_stack, 0
; GCN: .set use_vcc.has_recursion, 0
; GCN: .set use_vcc.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
; GCN: NumVgprs: 0
; GCN: ScratchSize: 0
define void @use_vcc() #1 {
@@ -59,14 +59,14 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
; GCN-LABEL: {{^}}use_flat_scratch:
; GCN: .set use_flat_scratch.num_vgpr, 0
; GCN: .set use_flat_scratch.num_agpr, 0
-; GCN: .set use_flat_scratch.numbered_sgpr, 0
+; GCN: .set use_flat_scratch.numbered_sgpr, 32
; GCN: .set use_flat_scratch.private_seg_size, 0
; GCN: .set use_flat_scratch.uses_vcc, 0
; GCN: .set use_flat_scratch.uses_flat_scratch, 1
; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
; GCN: .set use_flat_scratch.has_recursion, 0
; GCN: .set use_flat_scratch.has_indirect_call, 0
-; GCN: TotalNumSgprs: 6
+; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 0
; GCN: ScratchSize: 0
define void @use_flat_scratch() #1 {
@@ -113,14 +113,14 @@ define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace
; GCN-LABEL: {{^}}use_10_vgpr:
; GCN: .set use_10_vgpr.num_vgpr, 10
; GCN: .set use_10_vgpr.num_agpr, 0
-; GCN: .set use_10_vgpr.numbered_sgpr, 0
+; GCN: .set use_10_vgpr.numbered_sgpr, 32
; GCN: .set use_10_vgpr.private_seg_size, 0
; GCN: .set use_10_vgpr.uses_vcc, 0
; GCN: .set use_10_vgpr.uses_flat_scratch, 0
; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
; GCN: .set use_10_vgpr.has_recursion, 0
; GCN: .set use_10_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
; GCN: NumVgprs: 10
; GCN: ScratchSize: 0
define void @use_10_vgpr() #1 {
@@ -168,14 +168,14 @@ define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
; GCN-LABEL: {{^}}use_50_vgpr:
; GCN: .set use_50_vgpr.num_vgpr, 50
; GCN: .set use_50_vgpr.num_agpr, 0
-; GCN: .set use_50_vgpr.numbered_sgpr, 0
+; GCN: .set use_50_vgpr.numbered_sgpr, 32
; GCN: .set use_50_vgpr.private_seg_size, 0
; GCN: .set use_50_vgpr.uses_vcc, 0
; GCN: .set use_50_vgpr.uses_flat_scratch, 0
; GCN: .set use_50_vgpr.has_dyn_sized_stack, 0
; GCN: .set use_50_vgpr.has_recursion, 0
; GCN: .set use_50_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
; GCN: NumVgprs: 50
; GCN: ScratchSize: 0
define void @use_50_vgpr() #1 {
@@ -258,14 +258,14 @@ define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
; GCN-LABEL: {{^}}use_stack0:
; GCN: .set use_stack0.num_vgpr, 1
; GCN: .set use_stack0.num_agpr, 0
-; GCN: .set use_stack0.numbered_sgpr, 0
+; GCN: .set use_stack0.numbered_sgpr, 33
; GCN: .set use_stack0.private_seg_size, 2052
; GCN: .set use_stack0.uses_vcc, 0
; GCN: .set use_stack0.uses_flat_scratch, 0
; GCN: .set use_stack0.has_dyn_sized_stack, 0
; GCN: .set use_stack0.has_recursion, 0
; GCN: .set use_stack0.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 37
; GCN: NumVgprs: 1
; GCN: ScratchSize: 2052
define void @use_stack0() #1 {
@@ -277,14 +277,14 @@ define void @use_stack0() #1 {
; GCN-LABEL: {{^}}use_stack1:
; GCN: .set use_stack1.num_vgpr, 1
; GCN: .set use_stack1.num_agpr, 0
-; GCN: .set use_stack1.numbered_sgpr, 0
+; GCN: .set use_stack1.numbered_sgpr, 33
; GCN: .set use_stack1.private_seg_size, 404
; GCN: .set use_stack1.uses_vcc, 0
; GCN: .set use_stack1.uses_flat_scratch, 0
; GCN: .set use_stack1.has_dyn_sized_stack, 0
; GCN: .set use_stack1.has_recursion, 0
; GCN: .set use_stack1.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 37
; GCN: NumVgprs: 1
; GCN: ScratchSize: 404
define void @use_stack1() #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index bf452a9e38e01..cd89a36fe538b 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,7 +15,7 @@
; CHECK: .max_flat_workgroup_size: 1024
; CHECK: .name: test
; CHECK: .private_segment_fixed_size: 0
-; CHECK: .sgpr_count: 16
+; CHECK: .sgpr_count: 10
; CHECK: .symbol: test.kd
; CHECK: .vgpr_count: {{3|6}}
; WAVE64: .wavefront_size: 64
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index f7e3498907005..024593c49dba1 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -63,7 +63,7 @@
; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7
; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E
; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
-; ELF: 0250: 6770725F 636F756E 7401B12E 76677072
+; ELF: 0250: 6770725F 636F756E 7402B12E 76677072
; ELF: 0260: 5F737069 6C6C5F63 6F756E74 00AF2E77
; ELF: 0270: 61766566 726F6E74 5F73697A 6540AD61
; ELF: 0280: 6D646873 612E7461 72676574 BD616D64
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
deleted file mode 100644
index 45de8a79fe88d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Use VGPRs above the input arguments.
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0x1d{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
- i32 %vcr, { i32 } %system.data,
- i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
- i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
- i32 %inactive.vgpr8, i32 %inactive.vgpr9)
- local_unnamed_addr {
-entry:
- %system.data.value = extractvalue { i32 } %system.data, 0
- %dead.val = call i32 @llvm.amdgcn.dead.i32()
- %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
- br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
- %system.data.extract = extractvalue { i32 } %system.data, 0
- %data.mul = mul i32 %system.data.extract, 2
- %data.add = add i32 %data.mul, 1
- call void asm sideeffect "; clobber v28", "~{v28}"()
- br label %tail
-
-tail:
- %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
- %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
- %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
- %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
- %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
- %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
- %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
- %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
- %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
- %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
- %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
- %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
- %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
- %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
- %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
- %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
- %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
- %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
- %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
- %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
- %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
- %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
- %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
- %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
- %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
- %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
- %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
- %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
- call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
- @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
- ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
- { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
- i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
- unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
deleted file mode 100644
index 9c636d4516a80..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; CHECK-LABEL: leaf_shader:
-; CHECK: .vgpr_count:{{.*}}0x1{{$}}
-
-; Function without calls.
-define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
- i32 %active.vgpr1, i32 %active.vgpr2,
- i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
- i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
- local_unnamed_addr {
-entry:
- %dead.val = call i32 @llvm.amdgcn.dead.i32()
- %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
- br i1 %is.whole.wave, label %compute, label %merge
-
-compute:
- ; Perform a more complex computation using active VGPRs
- %square = mul i32 %active.vgpr1, %active.vgpr1
- %product = mul i32 %square, %active.vgpr2
- %sum = add i32 %product, %input.value
- %result = add i32 %sum, 42
- br label %merge
-
-merge:
- %final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
- %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
- %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
- %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
- %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
- %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
- %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
-
- store i32 %final.result, ptr %output.ptr, align 4
-
- ret void
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
deleted file mode 100644
index 1b0d33cec7052..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
-; In that case, the VGPR should be included in the .vgpr_count
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0xd{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
- i32 %vcr, { i32 } %system.data,
- i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
- i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
- i32 %inactive.vgpr8, i32 %inactive.vgpr9)
- local_unnamed_addr {
-entry:
- %system.data.value = extractvalue { i32 } %system.data, 0
- %dead.val = call i32 @llvm.amdgcn.dead.i32()
- %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
- br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
- %system.data.extract = extractvalue { i32 } %system.data, 0
- %data.mul = mul i32 %system.data.extract, 2
- %data.add = add i32 %data.mul, 1
- call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
- br label %tail
-
-tail:
- %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
- %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
- %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
- %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
- %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
- %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
- %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
- %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
- %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
- %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
- %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
- %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
- %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
- %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
- %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
- %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
- %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
- %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
- %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
- %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
- %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
- %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
- %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
- %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
- %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
- %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
- %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
- %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
- call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
- @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
- ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
- { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
- i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
- unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
deleted file mode 100644
index 9408501718784..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0xa{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
- i32 %vcr, { i32 } %system.data,
- i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
- i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
- i32 %inactive.vgpr8, i32 %inactive.vgpr9)
- local_unnamed_addr {
-entry:
- %system.data.value = extractvalue { i32 } %system.data, 0
- %dead.val = call i32 @llvm.amdgcn.dead.i32()
- %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
- br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
- %system.data.extract = extractvalue { i32 } %system.data, 0
- %data.mul = mul i32 %system.data.extract, 2
- %data.add = add i32 %data.mul, 1
- br label %tail
-
-tail:
- %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
- %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
- %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
- %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
- %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
- %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
- %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
- %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
- %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
- %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
- %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
- %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
- %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
- %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
- %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
- %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
- %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
- %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
- %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
- %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
- %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
- %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
- %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
- %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
- %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
- %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
- %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
- %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
- call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
- @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
- ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
- { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
- i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
- unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index c3b033113431f..464cd820028cc 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -64,7 +64,7 @@ define void @func_regular_call() #1 {
; GCN-NEXT: s_addc_u32 s17,
; GCN-NEXT: s_setpc_b64 s[16:17]
-; GCN: ; TotalNumSgprs: 18
+; GCN: ; TotalNumSgprs: 32
; GCN: ; NumVgprs: 8
define void @func_tail_call() #1 {
tail call void @func()
diff --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
index 03694b913d6e0..60bbf4646ee03 100644
--- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
+++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
@@ -24,7 +24,7 @@ define void @I_Quit() {
; CHECK-LABEL: P_RemoveMobj:
; CHECK: .set P_RemoveMobj.num_vgpr, 0
; CHECK: .set P_RemoveMobj.num_agpr, 0
-; CHECK: .set P_RemoveMobj.numbered_sgpr, 0
+; CHECK: .set P_RemoveMobj.numbered_sgpr, 32
; CHECK: .set P_RemoveMobj.private_seg_size, 0
; CHECK: .set P_RemoveMobj.uses_vcc, 0
; CHECK: .set P_RemoveMobj.uses_flat_scratch, 0
@@ -38,7 +38,7 @@ define void @P_RemoveMobj() {
; CHECK-LABEL: P_SpawnMobj:
; CHECK: .set P_SpawnMobj.num_vgpr, 0
; CHECK: .set P_SpawnMobj.num_agpr, 0
-; CHECK: .set P_SpawnMobj.numbered_sgpr, 0
+; CHECK: .set P_SpawnMobj.numbered_sgpr, 32
; CHECK: .set P_SpawnMobj.private_seg_size, 0
; CHECK: .set P_SpawnMobj.uses_vcc, 0
; CHECK: .set P_SpawnMobj.uses_flat_scratch, 0
@@ -52,7 +52,7 @@ define void @P_SpawnMobj() {
; CHECK-LABEL: G_PlayerReborn:
; CHECK: .set G_PlayerReborn.num_vgpr, 0
; CHECK: .set G_PlayerReborn.num_agpr, 0
-; CHECK: .set G_PlayerReborn.numbered_sgpr, 0
+; CHECK: .set G_PlayerReborn.numbered_sgpr, 32
; CHECK: .set G_PlayerReborn.private_seg_size, 0
; CHECK: .set G_PlayerReborn.uses_vcc, 0
; CHECK: .set G_PlayerReborn.uses_flat_scratch, 0
@@ -66,7 +66,7 @@ define void @G_PlayerReborn() {
; CHECK-LABEL: P_SetThingPosition:
; CHECK: .set P_SetThingPosition.num_vgpr, 0
; CHECK: .set P_SetThingPosition.num_agpr, 0
-; CHECK: .set P_SetThingPosition.numbered_sgpr, 0
+; CHECK: .set P_SetThingPosition.numbered_sgpr, 32
; CHECK: .set P_SetThingPosition.private_seg_size, 0
; CHECK: .set P_SetThingPosition.uses_vcc, 0
; CHECK: .set P_SetThingPosition.uses_flat_scratch, 0
@@ -96,7 +96,7 @@ define void @P_SetupPsprites(ptr addrspace(1) %i) {
; CHECK-LABEL: HU_Start:
; CHECK: .set HU_Start.num_vgpr, 0
; CHECK: .set HU_Start.num_agpr, 0
-; CHECK: .set HU_Start.numbered_sgpr, 0
+; CHECK: .set HU_Start.numbered_sgpr, 32
; CHECK: .set HU_Start.private_seg_size, 0
; CHECK: .set HU_Start.uses_vcc, 0
; CHECK: .set HU_Start.uses_flat_scratch, 0
@@ -162,7 +162,7 @@ define void @G_DoReborn() {
; CHECK-LABEL: AM_Stop:
; CHECK: .set AM_Stop.num_vgpr, 0
; CHECK: .set AM_Stop.num_agpr, 0
-; CHECK: .set AM_Stop.numbered_sgpr, 0
+; CHECK: .set AM_Stop.numbered_sgpr, 32
; CHECK: .set AM_Stop.private_seg_size, 0
; CHECK: .set AM_Stop.uses_vcc, 0
; CHECK: .set AM_Stop.uses_flat_scratch, 0
@@ -176,7 +176,7 @@ define void @AM_Stop() {
; CHECK-LABEL: D_AdvanceDemo:
; CHECK: .set D_AdvanceDemo.num_vgpr, 0
; CHECK: .set D_AdvanceDemo.num_agpr, 0
-; CHECK: .set D_AdvanceDemo.numbered_sgpr, 0
+; CHECK: .set D_AdvanceDemo.numbered_sgpr, 32
; CHECK: .set D_AdvanceDemo.private_seg_size, 0
; CHECK: .set D_AdvanceDemo.uses_vcc, 0
; CHECK: .set D_AdvanceDemo.uses_flat_scratch, 0
@@ -190,7 +190,7 @@ define void @D_AdvanceDemo() {
; CHECK-LABEL: F_StartFinale:
; CHECK: .set F_StartFinale.num_vgpr, 0
; CHECK: .set F_StartFinale.num_agpr, 0
-; CHECK: .set F_StartFinale.numbered_sgpr, 0
+; CHECK: .set F_StartFinale.numbered_sgpr, 32
; CHECK: .set F_StartFinale.private_seg_size, 0
; CHECK: .set F_StartFinale.uses_vcc, 0
; CHECK: .set F_StartFinale.uses_flat_scratch, 0
@@ -204,7 +204,7 @@ define void @F_StartFinale() {
; CHECK-LABEL: F_Ticker:
; CHECK: .set F_Ticker.num_vgpr, 0
; CHECK: .set F_Ticker.num_agpr, 0
-; CHECK: .set F_Ticker.numbered_sgpr, 0
+; CHECK: .set F_Ticker.numbered_sgpr, 32
; CHECK: .set F_Ticker.private_seg_size, 0
; CHECK: .set F_Ticker.uses_vcc, 0
; CHECK: .set F_Ticker.uses_flat_scratch, 0
@@ -236,7 +236,7 @@ define i32 @G_CheckDemoStatus() {
; CHECK-LABEL: P_TempSaveGameFile:
; CHECK: .set P_TempSaveGameFile.num_vgpr, 2
; CHECK: .set P_TempSaveGameFile.num_agpr, 0
-; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 0
+; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 32
; CHECK: .set P_TempSaveGameFile.private_seg_size, 0
; CHECK: .set P_TempSaveGameFile.uses_vcc, 0
; CHECK: .set P_TempSaveGameFile.uses_flat_scratch, 0
@@ -250,7 +250,7 @@ define ptr @P_TempSaveGameFile() {
; CHECK-LABEL: P_SaveGameFile:
; CHECK: .set P_SaveGameFile.num_vgpr, 2
; CHECK: .set P_SaveGameFile.num_agpr, 0
-; CHECK: .set P_SaveGameFile.numbered_sgpr, 0
+; CHECK: .set P_SaveGameFile.numbered_sgpr, 32
; CHECK: .set P_SaveGameFile.private_seg_size, 0
; CHECK: .set P_SaveGameFile.uses_vcc, 0
; CHECK: .set P_SaveGameFile.uses_flat_scratch, 0
@@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() {
; CHECK-LABEL: R_FlatNumForName:
; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr)
; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr)
-; CHECK: .set R_FlatNumForName.numbered_sgpr, max(34, I_Error.numbered_sgpr)
+; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr)
; CHECK: .set R_FlatNumForName.private_seg_size, 16+max(I_Error.private_seg_size)
; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc)
; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch)
diff --git a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
index 83f58db1aa67f..7a810d0067c17 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
@@ -3,7 +3,7 @@
; CHECK-LABEL: {{^}}qux
; CHECK: .set qux.num_vgpr, 13
; CHECK: .set qux.num_agpr, 0
-; CHECK: .set qux.numbered_sgpr, 0
+; CHECK: .set qux.numbered_sgpr, 32
; CHECK: .set qux.private_seg_size, 0
; CHECK: .set qux.uses_vcc, 0
; CHECK: .set qux.uses_flat_scratch, 0
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
index 28c3131302a31..638dc8965987e 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -83,13 +83,13 @@
; CHECK-NEXT: multiple_stack:
; CHECK-NEXT: .backend_stack_size: 0x24
; CHECK-NEXT: .lds_size: 0
-; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .sgpr_count: 0x21
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: no_stack:
; CHECK-NEXT: .backend_stack_size: 0
; CHECK-NEXT: .lds_size: 0
-; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .sgpr_count: 0x20
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: no_stack_call:
@@ -122,7 +122,7 @@
; CHECK-NEXT: simple_lds:
; CHECK-NEXT: .backend_stack_size: 0
; CHECK-NEXT: .lds_size: 0x100
-; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .sgpr_count: 0x20
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: simple_lds_recurse:
@@ -134,7 +134,7 @@
; CHECK-NEXT: simple_stack:
; CHECK-NEXT: .backend_stack_size: 0x14
; CHECK-NEXT: .lds_size: 0
-; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .sgpr_count: 0x21
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: simple_stack_call:
diff --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
index a71fd7fe782ff..5b9b0feea9900 100644
--- a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
@@ -2,7 +2,7 @@
;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
-; ;CHECK: NumVgprs: 2
+; ;CHECK: NumVgprs: 4
define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
.entry:
%i1 = extractelement <2 x float> %arg3, i32 1
@@ -193,7 +193,7 @@ define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float>
; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
-; CHECK: NumVgprs: 2
+; CHECK: NumVgprs: 4
define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
.entry:
ret { <4 x float> } undef
@@ -202,7 +202,7 @@ define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg
; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
; Additionally set the PSInputAddr to 0 via the metadata
; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
-; CHECK: NumVgprs: 2
+; CHECK: NumVgprs: 4
define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
.entry:
ret { <4 x float> } undef
diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
index bfcf90037bfd3..35e11ad6a648b 100644
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -24,9 +24,7 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %
; SI-LABEL: {{^}}one_vgpr_used:
; SI: NumVgprs: 1
-define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) #0 {
+define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) nounwind {
store i32 %x, ptr addrspace(1) %out, align 4
ret void
}
-
-attributes #0 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index a2470a60cb19f..afb77ed190896 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -122,8 +122,8 @@ define void @test_func() !dbg !6 {
}
; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 22
-; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 3
+; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 4
+; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
; STDERR-NEXT: remark: foo.cl:8:0: Dynamic Stack: False
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
index 557ffd27a07f6..0d25bc97ff775 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -4,8 +4,8 @@
; CHECK-LABEL: {{^}}spill:
; GCN: NumSgprs: 104
; GCN-GCNTRACKERS: NumSgprs: 104
-; GCN: NumVgprs: 3
-; GCN-GCNTRACKERS: NumVgprs: 3
+; GCN: NumVgprs: 1
+; GCN-GCNTRACKERS: NumVgprs: 2
; GCN: ScratchSize: 0
; GCN-GCNTRACKERS: ScratchSize: 0
; GCN: Occupancy: 5
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index 95d707aee5662..c5732531f5423 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -11,8 +11,8 @@
; allow scheduling of other instructions which reduce RP
; CHECK-LABEL: {{^}}return_72xi32:
-; GFX11-PAL: NumSgprs: 0
-; GFX11-PAL-GCNTRACKERS: NumSgprs: 0
+; GFX11-PAL: NumSgprs: 33
+; GFX11-PAL-GCNTRACKERS: NumSgprs: 33
; GFX11-PAL: NumVgprs: 64
; GFX11-PAL-GCNTRACKERS: NumVgprs: 64
; GFX11-PAL: ScratchSize: 220
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index 8300a52955b91..462ac23ec7e0e 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -7,14 +7,14 @@
; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s
-; SI-MINREG: NumSgprs: {{[1]?[1-9]$}}
-; SI-MINREG: NumVgprs: {{[1]?[1-9]$}}
+; SI-MINREG: NumSgprs: {{[1-9]$}}
+; SI-MINREG: NumVgprs: {{[1-9]$}}
; SI-MAXOCC: NumSgprs: {{[1-4]?[0-9]$}}
; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}}
; stores may alias loads
-; VI-MINREG: NumSgprs: {{[1]?[0-9]$}}
+; VI-MINREG: NumSgprs: {{[0-9]$}}
; VI-MINREG: NumVgprs: {{[1-3][0-9]$}}
; stores may alias loads
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 682bbdedb37a3..6ddf0986755f9 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr 3
+; VI-NEXT: .amdhsa_next_free_vgpr 1
; VI-NEXT: .amdhsa_next_free_sgpr 18
; VI-NEXT: .amdhsa_reserve_vcc 0
; VI-NEXT: .amdhsa_reserve_flat_scratch 0
@@ -86,7 +86,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr 3
+; GFX9-NEXT: .amdhsa_next_free_vgpr 1
; GFX9-NEXT: .amdhsa_next_free_sgpr 18
; GFX9-NEXT: .amdhsa_reserve_vcc 0
; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
@@ -146,7 +146,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr 3
+; VI-NEXT: .amdhsa_next_free_vgpr 1
; VI-NEXT: .amdhsa_next_free_sgpr 18
; VI-NEXT: .amdhsa_reserve_vcc 0
; VI-NEXT: .amdhsa_reserve_flat_scratch 0
@@ -197,7 +197,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr 3
+; GFX9-NEXT: .amdhsa_next_free_vgpr 1
; GFX9-NEXT: .amdhsa_next_free_sgpr 18
; GFX9-NEXT: .amdhsa_reserve_vcc 0
; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
@@ -257,7 +257,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr 3
+; VI-NEXT: .amdhsa_next_free_vgpr 1
; VI-NEXT: .amdhsa_next_free_sgpr 18
; VI-NEXT: .amdhsa_reserve_vcc 0
; VI-NEXT: .amdhsa_reserve_flat_scratch 0
@@ -308,7 +308,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr 3
+; GFX9-NEXT: .amdhsa_next_free_vgpr 1
; GFX9-NEXT: .amdhsa_next_free_sgpr 18
; GFX9-NEXT: .amdhsa_reserve_vcc 0
; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index d3def45c4f9d2..30accc846d2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -6,7 +6,7 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_reserve_xnack_mask 1
; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
; ELF: AMDGPU Metadata
-; ELF: .sgpr_count: 12
+; ELF: .sgpr_count: 9
entry:
tail call void asm sideeffect "", "~{s[0:4]}"()
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index ad831e040d722..4f84b31f1877b 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -6,7 +6,7 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_reserve_xnack_mask 0
; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
; ELF: AMDGPU Metadata
-; ELF: .sgpr_count: 8
+; ELF: .sgpr_count: 5
entry:
tail call void asm sideeffect "", "~{s[0:4]}"()
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index d1e28e11601ce..644f434923368 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -6,7 +6,7 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_reserve_xnack_mask 1
; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
; ELF: AMDGPU Metadata
-; ELF: .sgpr_count: 12
+; ELF: .sgpr_count: 9
entry:
tail call void asm sideeffect "", "~{s[0:4]}"()
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
index 4802ec861d685..cf5b95a729974 100644
--- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
@@ -3,7 +3,7 @@
; CHECK-LABEL: __unnamed_1:
; CHECK: .set __unnamed_1.num_vgpr, 0
; CHECK: .set __unnamed_1.num_agpr, 0
-; CHECK: .set __unnamed_1.numbered_sgpr, 0
+; CHECK: .set __unnamed_1.numbered_sgpr, 32
; CHECK: .set __unnamed_1.private_seg_size, 0
; CHECK: .set __unnamed_1.uses_vcc, 0
; CHECK: .set __unnamed_1.uses_flat_scratch, 0
@@ -16,7 +16,7 @@ entry:
}
; CHECK-LABEL: __unnamed_2:
-; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
+; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
index ee35dc4cddade..2cb5e309c8c21 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
}
; GCN-LABEL: {{^}}f1024_0:
-; GFX90A: NumVgprs: 1
+; GFX90A: NumVgprs: 32
; GFX90A: NumAgprs: 1
-; GFX90A: TotalNumVgprs: 5
+; GFX90A: TotalNumVgprs: 33
define void @f1024_0() #1024 {
call void @foo()
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
deleted file mode 100644
index 8c8182db7b479..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED
-; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
-target triple = "amdgcn-amd-amdhsa"
-
- at global = addrspace(1) global i32 poison, align 4
-
-; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
-; Only hardware-initialized VGPRs (v0) are read in this kernel.
-
-; CHECK-LABEL: amdhsa.kernels:
-; CHECK-LABEL: kernel_x
-; CHECK: .vgpr_count: 1
-define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
-entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
- ret void
-}
-
-; CHECK-LABEL: kernel_z
-; PACKED: .vgpr_count: 1
-; NOTPACKED: .vgpr_count: 3
-define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
-entry:
- %id = call i32 @llvm.amdgcn.workitem.id.z()
- call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
- ret void
-}
-
-attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
deleted file mode 100644
index f5d28a0ae1628..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
-; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
-target triple = "amdgcn--amdpal"
-
- at global = addrspace(1) global i32 poison, align 4
-
-; CHECK-LABEL: amdpal.pipelines:
-
-; Neither uses not writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used.
-; CHECK-LABEL: .entry_point_symbol: kernel_use
-; CHECK: .vgpr_count: 0x20
-define amdgpu_cs void @kernel_use([32 x i32] %args) {
-entry:
- %a = extractvalue [32 x i32] %args, 14
- store i32 %a, ptr addrspace(1) @global
- ret void
-}
-
-; Neither uses not writes a VGPR
-; CHECK-LABEL: chain_func:
-; CHECK: .vgpr_count: 0x1
-define amdgpu_cs_chain void @chain_func([32 x i32] %args) {
-entry:
- call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a(
- ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0)
- unreachable
-}
-
-; Neither uses not writes a VGPR
-; CHECK-LABEL: gfx_func:
-; CHECK: .vgpr_count: 0x1
-define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) {
-entry:
- ret [32 x i32] %args
-}
More information about the llvm-commits
mailing list