[llvm] [AMDGPU] Allocate scratch space for dVGPRs for CWSR (PR #130055)
Diana Picus via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 19 02:32:42 PDT 2025
https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/130055
>From 3cacd079d8671473213fd273a8df6839723728a2 Mon Sep 17 00:00:00 2001
From: Jannik Silvanus <jannik.silvanus at amd.com>
Date: Thu, 30 Mar 2023 17:45:38 +0200
Subject: [PATCH 01/12] [AMDGPU] Add GFX12 S_ALLOC_VGPR instruction
This patch only adds the instruction for disassembly support.
We have neither an intrinsic nor codegen support, and it is
unclear whether we ever want an intrinsic, given the fragile
semantics.
For now, the instruction will be generated only by the backend in
very specific circumstances.
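For reference, the accepted operand forms (exercised by the MC tests
below) are an SGPR, an inline immediate, or a 32-bit literal:
  s_alloc_vgpr s35
  s_alloc_vgpr 18
  s_alloc_vgpr 0x1235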
---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 7 +++++++
llvm/test/MC/AMDGPU/gfx11_unsupported.s | 3 +++
llvm/test/MC/AMDGPU/gfx12_asm_sop1.s | 9 +++++++++
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt | 9 +++++++++
4 files changed, 28 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 5e62ceac281b8..eeac9c1ad1084 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -423,6 +423,12 @@ let SubtargetPredicate = isGFX11Plus in {
}
} // End SubtargetPredicate = isGFX11Plus
+let SubtargetPredicate = isGFX12Plus in {
+ let hasSideEffects = 1, Defs = [SCC] in {
+ def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr">;
+ }
+} // End SubtargetPredicate = isGFX12Plus
+
class SOP1_F32_Inst<string opName, SDPatternOperator Op, ValueType vt0=f32,
ValueType vt1=vt0> :
SOP1_32<opName, [(set vt0:$sdst, (UniformUnaryFrag<Op> vt1:$src0))]>;
@@ -2048,6 +2054,7 @@ defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>;
defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>;
defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>;
defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>;
+defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>;
defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
index d5d10ce548c4d..cd30647fe6378 100644
--- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
@@ -322,6 +322,9 @@ image_sample_cd_o v252, v[1:4], s[8:15], s[12:15] dmask:0x1
image_sample_cd_o_g16 v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+s_alloc_vgpr s0
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
s_atomic_add flat_scratch_hi, s[2:3], s0
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
index 1e12e5bb48828..3828a4d36adcc 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
@@ -1,5 +1,14 @@
// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s
+s_alloc_vgpr 0x1235
+// GFX12: encoding: [0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00]
+
+s_alloc_vgpr 18
+// GFX12: encoding: [0x92,0x53,0x80,0xbe]
+
+s_alloc_vgpr s35
+// GFX12: encoding: [0x23,0x53,0x80,0xbe]
+
s_sleep_var 0x1234
// GFX12: encoding: [0xff,0x58,0x80,0xbe,0x34,0x12,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
index fa7d020bdd726..2ee84fa976d79 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
@@ -1,5 +1,14 @@
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# GFX12: s_alloc_vgpr 0x1235 ; encoding: [0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00]
+0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00
+
+# GFX12: s_alloc_vgpr 18 ; encoding: [0x92,0x53,0x80,0xbe]
+0x92,0x53,0x80,0xbe
+
+# GFX12: s_alloc_vgpr s35 ; encoding: [0x23,0x53,0x80,0xbe]
+0x23,0x53,0x80,0xbe
+
# GFX12: s_sleep_var 0x1234 ; encoding: [0xff,0x58,0x80,0xbe,0x34,0x12,0x00,0x00]
0xff,0x58,0x80,0xbe,0x34,0x12,0x00,0x00
>From b2a7bdc3954d2bf72e99d730ce00159c2550f563 Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Mon, 23 Oct 2023 10:36:31 +0200
Subject: [PATCH 02/12] [AMDGPU] Add SubtargetFeature for dynamic VGPR mode
This represents a hardware mode supported only for wave32 compute
shaders. When enabled, we set the `.dynamic_vgpr_en` field of
`.compute_registers` to true in the PAL metadata.
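With the feature enabled, the emitted PAL metadata then contains, for
example (see the updated test below):
  .compute_registers:
    .dynamic_vgpr_en: true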
---
llvm/docs/AMDGPUUsage.rst | 6 ++++++
llvm/lib/Target/AMDGPU/AMDGPU.td | 6 ++++++
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 3 +++
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +++
llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll | 13 ++++++++-----
5 files changed, 26 insertions(+), 5 deletions(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index def6addd595e8..59cc08a59ed7c 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -758,6 +758,12 @@ For example:
enabled will execute correctly but may be less
performant than code generated for XNACK replay
disabled.
+
+ dynamic-vgpr TODO Represents the "Dynamic VGPR" hardware mode, introduced in GFX12.
+ Waves launched in this mode may allocate or deallocate the VGPRs
+ using dedicated instructions, but may not send the DEALLOC_VGPRS
+ message.
+
=============== ============================ ==================================================
.. _amdgpu-target-id:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index effc8d2ed6b49..31a98ee132bf6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1239,6 +1239,12 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
>;
+def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
+ "DynamicVGPR",
+ "true",
+ "Enable dynamic VGPR mode"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 31e0bd8d652bc..13e61756e3036 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1414,6 +1414,9 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
MD->setHwStage(CC, ".trap_present",
(bool)CurrentProgramInfo.TrapHandlerEnable);
MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
+
+ if (ST.isDynamicVGPREnabled())
+ MD->setComputeRegisters(".dynamic_vgpr_en", true);
}
MD->setHwStage(CC, ".lds_size",
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 6664a70572ded..1254cbad83b60 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -190,6 +190,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// indicates a lack of S_CLAUSE support.
unsigned MaxHardClauseLength = 0;
bool SupportsSRAMECC = false;
+ bool DynamicVGPR = false;
// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
@@ -1647,6 +1648,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return true;
}
+ bool isDynamicVGPREnabled() const { return DynamicVGPR; }
+
bool requiresDisjointEarlyClobberAndUndef() const override {
// AMDGPU doesn't care if early-clobber and undef operands are allocated
// to the same register.
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
index 7536e83a9da6b..fa22089978c2e 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -1,4 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr <%s | FileCheck %s --check-prefixes=CHECK,DVGPR
; CHECK-LABEL: {{^}}_amdgpu_cs_main:
; CHECK: ; TotalNumSgprs: 4
@@ -8,6 +10,7 @@
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
+; DVGPR-NEXT: .dynamic_vgpr_en: true
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
@@ -57,7 +60,7 @@
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
-; CHECK-NEXT: .ieee_mode: false
+; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
@@ -112,7 +115,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_gs
; CHECK-NEXT: .entry_point_symbol: gs_shader
-; CHECK-NEXT: .ieee_mode: false
+; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
@@ -124,7 +127,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_hs
; CHECK-NEXT: .entry_point_symbol: hs_shader
-; CHECK-NEXT: .ieee_mode: false
+; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x1000
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
@@ -136,7 +139,7 @@
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_ps
; CHECK-NEXT: .entry_point_symbol: ps_shader
-; CHECK-NEXT: .ieee_mode: false
+; GFX11-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
>From c29d8202c06488a9466aea49dda4cf2b4663236e Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Mon, 23 Oct 2023 11:46:19 +0200
Subject: [PATCH 03/12] [AMDGPU] Deallocate VGPRs before exiting in dynamic
VGPR mode
In dynamic VGPR mode, waves must deallocate all VGPRs before exiting. If
the shader program does not do this, the hardware inserts `S_ALLOC_VGPR 0`
before S_ENDPGM, but this may incur some performance cost. Therefore
it's better if the compiler proactively generates that instruction.
This patch extends `si-insert-waitcnts` to deallocate the VGPRs via
an `S_ALLOC_VGPR 0` before any `S_ENDPGM` when in dynamic VGPR mode.
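In other words, in dynamic VGPR mode the pass now emits (sketch):
  ...               ; shader body
  s_alloc_vgpr 0    ; release all dynamically allocated VGPR blocks
  s_endpgm
while the existing DEALLOC_VGPRS sendmsg path is kept for the
non-dynamic case.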
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 60 +--
.../CodeGen/AMDGPU/release-vgprs-gfx12.mir | 356 ++++++++++++++++++
2 files changed, 393 insertions(+), 23 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7e6bce2bf5f12..42ef23e836a58 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1647,17 +1647,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
}
- // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
- // stores. In this case it can be useful to send a message to explicitly
- // release all VGPRs before the stores have completed, but it is only safe to
- // do this if:
- // * there are no outstanding scratch stores
- // * we are not in Dynamic VGPR mode
+ // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
+ // Technically the hardware will do this on its own if we don't, but that
+ // might cost extra cycles compared to doing it explicitly.
+ // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
+ // have to wait for outstanding VMEM stores. In this case it can be useful to
+ // send a message to explicitly release all VGPRs before the stores have
+ // completed, but it is only safe to do this if there are no outstanding
+ // scratch stores.
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
- if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
- ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
- !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
+ if (!WCG->isOptNone() &&
+ (ST->isDynamicVGPREnabled() ||
+ (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+ ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
+ !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
ReleaseVGPRInsts.insert(&MI);
}
// Resolve vm waits before gs-done.
@@ -2610,26 +2614,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
}
- // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
- // instructions.
+ // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
+ // This is done in different ways depending on how the VGPRs were allocated
+ // (i.e. whether we're in dynamic VGPR mode or not).
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
// waveslot limited kernel runs slower with the deallocation.
- if (!ReleaseVGPRInsts.empty() &&
- (MF.getFrameInfo().hasCalls() ||
- ST->getOccupancyWithNumVGPRs(
- TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
- AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+ if (ST->isDynamicVGPREnabled()) {
for (MachineInstr *MI : ReleaseVGPRInsts) {
- if (ST->requiresNopBeforeDeallocVGPRs()) {
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_NOP))
- .addImm(0);
- }
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_SENDMSG))
- .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+ TII->get(AMDGPU::S_ALLOC_VGPR))
+ .addImm(0);
Modified = true;
}
+ } else {
+ if (!ReleaseVGPRInsts.empty() &&
+ (MF.getFrameInfo().hasCalls() ||
+ ST->getOccupancyWithNumVGPRs(
+ TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
+ AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+ for (MachineInstr *MI : ReleaseVGPRInsts) {
+ if (ST->requiresNopBeforeDeallocVGPRs()) {
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_NOP))
+ .addImm(0);
+ }
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_SENDMSG))
+ .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+ Modified = true;
+ }
+ }
}
ReleaseVGPRInsts.clear();
PreheadersToFlush.clear();
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
new file mode 100644
index 0000000000000..884b5f8b6f018
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
@@ -0,0 +1,356 @@
+# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
+# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
+
+--- |
+ define amdgpu_ps void @tbuffer_store1() { ret void }
+ define amdgpu_ps void @tbuffer_store2() { ret void }
+ define amdgpu_ps void @flat_store() { ret void }
+ define amdgpu_ps void @global_store() { ret void }
+ define amdgpu_ps void @buffer_store_format() { ret void }
+ define amdgpu_ps void @ds_write_b32() { ret void }
+ define amdgpu_ps void @global_store_dword() { ret void }
+ define amdgpu_ps void @multiple_basic_blocks1() { ret void }
+ define amdgpu_ps void @multiple_basic_blocks2() { ret void }
+ define amdgpu_ps void @multiple_basic_blocks3() { ret void }
+ define amdgpu_ps void @recursive_loop() { ret void }
+ define amdgpu_ps void @recursive_loop_vmem() { ret void }
+ define amdgpu_ps void @image_store() { ret void }
+ define amdgpu_ps void @scratch_store() { ret void }
+ define amdgpu_ps void @buffer_atomic() { ret void }
+ define amdgpu_ps void @flat_atomic() { ret void }
+ define amdgpu_ps void @global_atomic() { ret void }
+ define amdgpu_ps void @image_atomic() { ret void }
+ define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
+...
+
+---
+name: tbuffer_store1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: tbuffer_store1
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: tbuffer_store2
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: tbuffer_store2
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: flat_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: flat_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: global_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: global_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ S_WAIT_STORECNT 0
+ S_ENDPGM 0
+...
+
+---
+name: buffer_store_format
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: buffer_store_format
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: ds_write_b32
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: ds_write_b32
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0 = IMPLICIT_DEF
+ renamable $vgpr1 = IMPLICIT_DEF
+ DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0
+ S_ENDPGM 0
+
+...
+---
+name: global_store_dword
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: global_store_dword
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: multiple_basic_blocks1
+body: |
+ ; CHECK-LABEL: name: multiple_basic_blocks1
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+
+---
+name: multiple_basic_blocks2
+body: |
+ ; CHECK-LABEL: name: multiple_basic_blocks2
+ ; CHECK: bb.2:
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.2
+
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: multiple_basic_blocks3
+body: |
+ ; CHECK-LABEL: name: multiple_basic_blocks3
+ ; CHECK: bb.4:
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.4
+
+ S_BRANCH %bb.4
+
+ bb.3:
+ successors: %bb.4
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_ENDPGM 0
+...
+
+---
+name: recursive_loop
+body: |
+ ; CHECK-LABEL: name: recursive_loop
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: recursive_loop_vmem
+body: |
+ ; CHECK-LABEL: name: recursive_loop_vmem
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
+ S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: image_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: image_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: scratch_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: scratch_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
+ SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: buffer_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: buffer_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: flat_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: flat_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+
+---
+name: global_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: global_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: image_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: image_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: global_store_optnone
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: global_store_optnone
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK-NOT: S_ALLOC_VGPR
+ ; CHECK: S_ENDPGM 0
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ S_WAIT_STORECNT 0
+ S_ENDPGM 0
+...
>From d3ceb4ebd008183980b97d0887f9b2bdb30b7c6f Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Mon, 5 Feb 2024 13:48:16 +0100
Subject: [PATCH 04/12] [AMDGPU] Update target helpers & GCNSchedStrategy for
dynamic VGPRs
In dynamic VGPR mode, we can allocate up to 8 blocks of either 16 or 32
VGPRs (based on a chip-wide setting which we can model with a Subtarget
feature). Update some of the subtarget helpers to reflect this.
In particular:
- getVGPRAllocGranule is set to the block size
- getAddressableNumVGPRs will limit itself to 8 * the block size
We also try to be more careful about how many VGPR blocks we allocate.
Therefore, when deciding if we should revert scheduling after a given
stage, we check that we haven't increased the number of VGPR blocks that
need to be allocated.
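As a concrete example (matching the new unit tests): with the default
16-register blocks, the addressable limit becomes 8 * 16 = 128 VGPRs;
with +dynamic-vgpr-block-size-32 it becomes 8 * 32 = 256 VGPRs.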
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 6 ++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 10 +++
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6 ++
.../Target/AMDGPU/AMDGPUUnitTests.cpp | 62 +++++++++++++++++++
llvm/unittests/Target/AMDGPU/CMakeLists.txt | 1 +
6 files changed, 86 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 31a98ee132bf6..339eeec72da46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1245,6 +1245,12 @@ def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
"Enable dynamic VGPR mode"
>;
+def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
+ "DynamicVGPRBlockSize32",
+ "true",
+ "Use a block size of 32 for dynamic VGPR allocation (default is 16)"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c277223de13ac..4cc71f321f8f2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1452,6 +1452,16 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
if (WavesAfter < DAG.MinOccupancy)
return true;
+ // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
+ if (ST.isDynamicVGPREnabled()) {
+ unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
+ &ST, PressureBefore.getVGPRNum(false));
+ unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
+ &ST, PressureAfter.getVGPRNum(false));
+ if (BlocksAfter > BlocksBefore)
+ return true;
+ }
+
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 1254cbad83b60..9ccf38fb4dbbe 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -191,6 +191,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
unsigned MaxHardClauseLength = 0;
bool SupportsSRAMECC = false;
bool DynamicVGPR = false;
+ bool DynamicVGPRBlockSize32 = false;
// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b51cf536467b9..bebbb0dde0b9b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1154,6 +1154,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 8;
+ if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+ return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
+
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -1195,6 +1198,9 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;
+ if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+ // On GFX12 we can allocate at most 8 blocks of VGPRs.
+ return 8 * getVGPRAllocGranule(STI);
return getAddressableNumArchVGPRs(STI);
}
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
index 8fbd470815b79..21f45443281e7 100644
--- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@@ -152,6 +152,24 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
EXPECT_TRUE(ErrStr.empty()) << ErrStr;
}
+static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
+ TestFuncTy test) {
+ auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
+ "+dynamic-vgpr," + FS.str());
+ ASSERT_TRUE(TM) << "No target machine";
+
+ GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+ std::string(TM->getTargetFeatureString()), *TM);
+ ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
+
+ std::stringstream Table;
+ bool Success = testAndRecord(Table, ST, test);
+ EXPECT_TRUE(Success && !PrintCpuRegLimits)
+ << CPUName << " dynamic VGPR " << FS
+ << ":\nOcc MinVGPR MaxVGPR\n"
+ << Table.str() << '\n';
+}
+
TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
@@ -163,6 +181,50 @@ TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
};
testGPRLimits("VGPR", true, test);
+
+ testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
+ testDynamicVGPRLimits("gfx1200",
+ "+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
+}
+
+static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
+ unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
+ unsigned ExpectedMaxVGPRs) {
+ auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
+ ASSERT_TRUE(TM) << "No target machine";
+
+ GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+ std::string(TM->getTargetFeatureString()), *TM);
+
+ // Test function without attributes.
+ LLVMContext Context;
+ Module M("", Context);
+ Function *Func =
+ Function::Create(FunctionType::get(Type::getVoidTy(Context), false),
+ GlobalValue::ExternalLinkage, "testFunc", &M);
+ Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
+ Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
+
+ auto Range = ST.getWavesPerEU(*Func);
+ EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
+ EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
+ EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
+ EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
+ << CPUName << ' ' << FS;
+
+ // Function with requested 'amdgpu-waves-per-eu' in a valid range.
+ Func->addFnAttr("amdgpu-waves-per-eu", "10,12");
+ Range = ST.getWavesPerEU(*Func);
+ EXPECT_EQ(10u, Range.first) << CPUName << ' ' << FS;
+ EXPECT_EQ(12u, Range.second) << CPUName << ' ' << FS;
+}
+
+TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
+ testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
+ testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
+ testAbsoluteLimits(
+ "gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
+ 1, 16, 256);
}
static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {
diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
index ca8f48bc393ef..6d6f17883a07e 100644
--- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS
Core
GlobalISel
MC
+ MIRParser
Support
TargetParser
)
>From 3e20edfc6f3b1bfa60f5d778ce98c1fb984b1aee Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Tue, 24 Sep 2024 09:57:25 +0200
Subject: [PATCH 05/12] [AMDGPU] Allocate scratch space for dVGPRs for CWSR
The CWSR trap handler needs to save and restore the VGPRs. When dynamic
VGPRs are in use, the fixed-function hardware will only allocate enough
space for one VGPR block. The rest will have to be stored in scratch, at
offset 0.
This patch allocates the necessary space by:
- generating a prologue that checks at runtime if we're on a compute
queue (since CWSR only works on compute queues); for this we have to
check the ME_ID bits of the ID_HW_ID2 register - if they are
non-zero, we can assume we're on a compute queue and initialize the SP
and FP with enough room for the dynamic VGPRs
- forcing all compute entry functions to use an FP so they can access
their locals/spills correctly (this isn't ideal, but it's the quickest
option to implement)
Note that at the moment we allocate enough space for the theoretical
maximum number of VGPRs that can be allocated dynamically (for blocks of
16 registers, this will be 128, of which we subtract the first 16, which
are already allocated by the fixed-function hardware). Future patches
may decide to allocate less if they can prove the shader never allocates
that many blocks.
Also note that this should not affect any reported stack sizes (e.g. PAL
backend_stack_size etc.).
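As a worked example, for the default block size of 16: the theoretical
maximum is 8 * 16 = 128 VGPRs; subtracting the first block of 16 leaves
112 registers, so the prologue reserves 112 * 4 = 448 bytes (0x1c0) of
scratch, and the PAL metadata reports a .dynamic_vgpr_saved_count of
0x70 (= 112). These values match the new tests below.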
---
llvm/docs/AMDGPUUsage.rst | 65 +++--
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 9 +-
llvm/lib/Target/AMDGPU/SIDefines.h | 1 +
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 66 ++++-
llvm/lib/Target/AMDGPU/SIFrameLowering.h | 4 +
.../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 13 +
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 8 +
.../dynamic-vgpr-reserve-stack-for-cwsr.ll | 263 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll | 11 +-
9 files changed, 399 insertions(+), 41 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 59cc08a59ed7c..b5196930a50f7 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -6020,8 +6020,13 @@ Frame Pointer
If the kernel needs a frame pointer for the reasons defined in
``SIFrameLowering`` then SGPR33 is used and is always set to ``0`` in the
-kernel prolog. If a frame pointer is not required then all uses of the frame
-pointer are replaced with immediate ``0`` offsets.
+kernel prolog. On GFX12+, when dynamic VGPRs are enabled, the prologue will
+check if the kernel is running on a compute queue, and if so it will reserve
+some scratch space for any dynamic VGPRs that might need to be saved by the
+CWSR trap handler. In this case, the frame pointer will be initialized to
+a suitably aligned offset above this reserved area. If a frame pointer is not
+required then all uses of the frame pointer are replaced with immediate ``0``
+offsets.
.. _amdgpu-amdhsa-kernel-prolog-flat-scratch:
@@ -17133,33 +17138,35 @@ within a map that has been added by the same *vendor-name*.
.. table:: AMDPAL Code Object Hardware Stage Metadata Map
:name: amdgpu-amdpal-code-object-hardware-stage-metadata-map-table
- ========================== ============== ========= ===============================================================
- String Key Value Type Required? Description
- ========================== ============== ========= ===============================================================
- ".entry_point" string The ELF symbol pointing to this pipeline's stage entry point.
- ".scratch_memory_size" integer Scratch memory size in bytes.
- ".lds_size" integer Local Data Share size in bytes.
- ".perf_data_buffer_size" integer Performance data buffer size in bytes.
- ".vgpr_count" integer Number of VGPRs used.
- ".agpr_count" integer Number of AGPRs used.
- ".sgpr_count" integer Number of SGPRs used.
- ".vgpr_limit" integer If non-zero, indicates the shader was compiled with a
- directive to instruct the compiler to limit the VGPR usage to
- be less than or equal to the specified value (only set if
- different from HW default).
- ".sgpr_limit" integer SGPR count upper limit (only set if different from HW
- default).
- ".threadgroup_dimensions" sequence of Thread-group X/Y/Z dimensions (Compute only).
- 3 integers
- ".wavefront_size" integer Wavefront size (only set if different from HW default).
- ".uses_uavs" boolean The shader reads or writes UAVs.
- ".uses_rovs" boolean The shader reads or writes ROVs.
- ".writes_uavs" boolean The shader writes to one or more UAVs.
- ".writes_depth" boolean The shader writes out a depth value.
- ".uses_append_consume" boolean The shader uses append and/or consume operations, either
- memory or GDS.
- ".uses_prim_id" boolean The shader uses PrimID.
- ========================== ============== ========= ===============================================================
+ =========================== ============== ========= ===============================================================
+ String Key Value Type Required? Description
+ =========================== ============== ========= ===============================================================
+ ".entry_point" string The ELF symbol pointing to this pipeline's stage entry point.
+ ".scratch_memory_size" integer Scratch memory size in bytes.
+ ".lds_size" integer Local Data Share size in bytes.
+ ".perf_data_buffer_size" integer Performance data buffer size in bytes.
+ ".vgpr_count" integer Number of VGPRs used.
+ ".agpr_count" integer Number of AGPRs used.
+ ".sgpr_count" integer Number of SGPRs used.
+ ".dynamic_vgpr_saved_count" integer No Number of dynamic VGPRs that can be stored in scratch by the
+ CWSR trap handler. Only used on GFX12+.
+ ".vgpr_limit" integer If non-zero, indicates the shader was compiled with a
+ directive to instruct the compiler to limit the VGPR usage to
+ be less than or equal to the specified value (only set if
+ different from HW default).
+ ".sgpr_limit" integer SGPR count upper limit (only set if different from HW
+ default).
+ ".threadgroup_dimensions" sequence of Thread-group X/Y/Z dimensions (Compute only).
+ 3 integers
+ ".wavefront_size" integer Wavefront size (only set if different from HW default).
+ ".uses_uavs" boolean The shader reads or writes UAVs.
+ ".uses_rovs" boolean The shader reads or writes ROVs.
+ ".writes_uavs" boolean The shader writes to one or more UAVs.
+ ".writes_depth" boolean The shader writes out a depth value.
+ ".uses_append_consume" boolean The shader uses append and/or consume operations, either
+ memory or GDS.
+ ".uses_prim_id" boolean The shader uses PrimID.
+ =========================== ============== ========= ===============================================================
..
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 13e61756e3036..73c97a25f4d0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1439,8 +1439,15 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setEntryPoint(CC, MF.getFunction().getName());
MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
- // Only set AGPRs for supported devices
+ // For targets that support dynamic VGPRs, set the number of saved dynamic
+ // VGPRs (if any) in the PAL metadata.
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ if (STM.isDynamicVGPREnabled() &&
+ MFI->getScratchReservedForDynamicVGPRs() > 0)
+ MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
+ MFI->getScratchReservedForDynamicVGPRs() / 4);
+
+ // Only set AGPRs for supported devices
if (STM.hasMAIInsts()) {
MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 721601efcc804..8f9d099b25857 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -552,6 +552,7 @@ enum Id { // HwRegCode, (6) [5:0]
enum Offset : unsigned { // Offset, (5) [10:6]
OFFSET_MEM_VIOL = 8,
+ OFFSET_ME_ID = 8,
};
enum ModeRegisterMasks : uint32_t {
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 97736e2410c18..430d1824ef464 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -691,17 +691,61 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
- if (hasFP(MF)) {
+ unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
+ if (!mayReserveScratchForCWSR(MF)) {
+ if (hasFP(MF)) {
+ Register FPReg = MFI->getFrameOffsetReg();
+ assert(FPReg != AMDGPU::FP_REG);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
+ }
+
+ if (requiresStackPointerReference(MF)) {
+ Register SPReg = MFI->getStackPtrOffsetReg();
+ assert(SPReg != AMDGPU::SP_REG);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
+ }
+ } else {
+ // We need to check if we're on a compute queue - if we are, then the CWSR
+ // trap handler may need to store some VGPRs on the stack. The first VGPR
+ // block is saved separately, so we only need to allocate space for any
+ // additional VGPR blocks used. For now, we will make sure there's enough
+ // room for the theoretical maximum number of VGPRs that can be allocated.
+ // FIXME: Figure out if the shader uses fewer VGPRs in practice.
+ assert(hasFP(MF));
Register FPReg = MFI->getFrameOffsetReg();
assert(FPReg != AMDGPU::FP_REG);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
- }
-
- if (requiresStackPointerReference(MF)) {
Register SPReg = MFI->getStackPtrOffsetReg();
assert(SPReg != AMDGPU::SP_REG);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
- .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
+ unsigned VGPRSize =
+ llvm::alignTo((ST.getAddressableNumVGPRs() -
+ AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) *
+ 4,
+ FrameInfo.getMaxAlign());
+ MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
+ .addImm(AMDGPU::Hwreg::HwregEncoding::encode(
+ AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 1));
+ // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
+ // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
+ // SCC, so we need to check for 0 manually.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
+ if (requiresStackPointerReference(MF)) {
+ // If at least one of the constants can be inlined, then we can use
+ // s_cselect. Otherwise, use a mov and cmovk.
+ if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm()) ||
+ AMDGPU::isInlinableLiteral32(Offset + VGPRSize,
+ ST.hasInv2PiInlineImm())) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CSELECT_B32), SPReg)
+ .addImm(Offset + VGPRSize)
+ .addImm(Offset);
+ } else {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), SPReg)
+ .addImm(Offset + VGPRSize);
+ }
+ }
}
bool NeedsFlatScratchInit =
@@ -1831,9 +1875,17 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
MF) ||
+ mayReserveScratchForCWSR(MF) ||
MF.getTarget().Options.DisableFramePointerElim(MF);
}
+bool SIFrameLowering::mayReserveScratchForCWSR(
+ const MachineFunction &MF) const {
+ return MF.getSubtarget<GCNSubtarget>().isDynamicVGPREnabled() &&
+ AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
+ AMDGPU::isCompute(MF.getFunction().getCallingConv());
+}
+
// This is essentially a reduced version of hasFP for entry functions. Since the
// stack pointer is known 0 on entry to kernels, we never really need an FP
// register. We may need to initialize the stack pointer depending on the frame
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 938c75099a3bc..9dac4bc8951e5 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -86,6 +86,10 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
public:
bool requiresStackPointerReference(const MachineFunction &MF) const;
+
+ // Returns true if the function may need to reserve space on the stack for the
+ // CWSR trap handler.
+ bool mayReserveScratchForCWSR(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 740f752bc93b7..6d75b83ea2223 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -455,6 +455,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
+ // The size of the scratch space reserved for the CWSR trap handler to spill
+ // some of the dynamic VGPRs.
+ unsigned ScratchReservedForDynamicVGPRs = 0;
+
// Tracks information about user SGPRs that will be setup by hardware which
// will apply to all wavefronts of the grid.
GCNUserSGPRUsageInfo UserSGPRInfo;
@@ -780,6 +784,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
BytesInStackArgArea = Bytes;
}
+ // This is only used if we need to save any dynamic VGPRs in scratch.
+ unsigned getScratchReservedForDynamicVGPRs() const {
+ return ScratchReservedForDynamicVGPRs;
+ }
+
+ void setScratchReservedForDynamicVGPRs(unsigned Size) {
+ ScratchReservedForDynamicVGPRs = Size;
+ }
+
// Add user SGPRs.
Register addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
Register addDispatchPtr(const SIRegisterInfo &TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index adadf8e4e4e65..4c6d5f2d459f7 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -511,6 +511,14 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const SIFrameLowering *TFI = ST.getFrameLowering();
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+ // If we need to reserve scratch space for saving the VGPRs, then we should
+ // use the frame register for accessing our own frame (which may start at a
+ // non-zero offset).
+ if (TFI->mayReserveScratchForCWSR(MF))
+ return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
+ : FuncInfo->getStackPtrOffsetReg();
+
// During ISel lowering we always reserve the stack pointer in entry and chain
// functions, but never actually want to reference it when accessing our own
// frame. If we need a frame pointer we use it, but otherwise we can just use
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
new file mode 100644
index 0000000000000..d420af4ca100c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefix=CHECK %s
+
+; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
+
+define amdgpu_cs void @amdgpu_cs() #0 {
+; CHECK-LABEL: amdgpu_cs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ ret void
+}
+
+define amdgpu_kernel void @kernel() #0 {
+; CHECK-LABEL: kernel:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ ret void
+}
+
+define amdgpu_cs void @with_local() #0 {
+; CHECK-LABEL: with_local:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: v_mov_b32_e32 v0, 13
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %local = alloca i32, addrspace(5)
+ store volatile i8 13, ptr addrspace(5) %local
+ ret void
+}
+
+; Check that we generate s_cselect for SP if we can fit
+; the offset in an inline constant.
+define amdgpu_cs void @with_calls_inline_const() #0 {
+; CHECK-LABEL: with_calls_inline_const:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_mov_b32 s1, callee at abs32@hi
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_mov_b32 s0, callee at abs32@lo
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_cselect_b32 s32, 0x1d0, 16
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %local = alloca i32, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+; Check that we generate s_mov + s_cmovk if we can't
+; fit the offset for SP in an inline constant.
+define amdgpu_cs void @with_calls_no_inline_const() #0 {
+; CHECK-LABEL: with_calls_no_inline_const:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_mov_b32 s1, callee at abs32@hi
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_mov_b32 s0, callee at abs32@lo
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_movk_i32 s32, 0x100
+; CHECK-NEXT: s_cmovk_i32 s32, 0x2c0
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %local = alloca i32, i32 61, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+; We're going to limit this to 16 VGPRs, so we need to spill the rest.
+define amdgpu_cs void @with_spills(ptr addrspace(1) %p1, ptr addrspace(1) %p2) #1 {
+; CHECK-LABEL: with_spills:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:96
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:80 ; 16-byte Folded Spill
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_b128 v[8:11], v[0:1], off offset:112
+; CHECK-NEXT: global_load_b128 v[12:15], v[0:1], off offset:64
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:80
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:64 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:32
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:48 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:48
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:32 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 ; 16-byte Folded Spill
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:96
+; CHECK-NEXT: global_store_b128 v[2:3], v[8:11], off offset:112
+; CHECK-NEXT: global_store_b128 v[2:3], v[12:15], off offset:64
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:64 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:80
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:48 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:32
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:32 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:48
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:16 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %v = load <32 x i32>, ptr addrspace(1) %p1
+ store <32 x i32> %v, ptr addrspace(1) %p2
+ ret void
+}
+
+define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+; CHECK-LABEL: realign_stack:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_mov_b32 s1, callee at abs32@hi
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_mov_b32 s0, callee at abs32@lo
+; CHECK-NEXT: s_cmovk_i32 s33, 0x200
+; CHECK-NEXT: s_movk_i32 s32, 0x100
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112
+; CHECK-NEXT: scratch_store_b128 off, v[24:27], s33 offset:96
+; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80
+; CHECK-NEXT: scratch_store_b128 off, v[16:19], s33 offset:64
+; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48
+; CHECK-NEXT: scratch_store_b128 off, v[8:11], s33 offset:32
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16
+; CHECK-NEXT: scratch_store_b128 off, v[0:3], s33
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_cmovk_i32 s32, 0x300
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %v = alloca <32 x i32>, align 128, addrspace(5)
+ store <32 x i32> %x, ptr addrspace(5) %v
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+; Non-entry functions and graphics shaders can't run on a compute queue,
+; so they don't need to worry about CWSR.
+define amdgpu_gs void @amdgpu_gs() #0 {
+; CHECK-LABEL: amdgpu_gs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_mov_b32 s1, callee at abs32@hi
+; CHECK-NEXT: s_mov_b32 s0, callee at abs32@lo
+; CHECK-NEXT: s_mov_b32 s32, 16
+; CHECK-NEXT: scratch_store_b8 off, v0, off scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %local = alloca i32, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+define amdgpu_gfx void @amdgpu_gfx() #0 {
+; CHECK-LABEL: amdgpu_gfx:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_expcnt 0x0
+; CHECK-NEXT: s_wait_samplecnt 0x0
+; CHECK-NEXT: s_wait_bvhcnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: s_mov_b32 s0, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b32 s1, -1
+; CHECK-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_mov_b32 exec_lo, s1
+; CHECK-NEXT: v_writelane_b32 v40, s0, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: s_add_co_i32 s32, s32, 16
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s0, v40, 2
+; CHECK-NEXT: s_or_saveexec_b32 s1, -1
+; CHECK-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_mov_b32 exec_lo, s1
+; CHECK-NEXT: s_mov_b32 s33, s0
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %local = alloca i32, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+define void @default() #0 {
+; CHECK-LABEL: default:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_expcnt 0x0
+; CHECK-NEXT: s_wait_samplecnt 0x0
+; CHECK-NEXT: s_wait_bvhcnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ ret void
+}
+
+declare amdgpu_gfx void @callee(i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-num-vgpr"="16"}
+
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
index fa22089978c2e..5748f6b188acf 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -1,9 +1,10 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11,NODVGPR
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK,NODVGPR
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr <%s | FileCheck %s --check-prefixes=CHECK,DVGPR
; CHECK-LABEL: {{^}}_amdgpu_cs_main:
-; CHECK: ; TotalNumSgprs: 4
+; NODVGPR: ; TotalNumSgprs: 4
+; DVGPR: ; TotalNumSgprs: 34
; CHECK: ; NumVgprs: 2
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
@@ -56,6 +57,7 @@
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: false
+; DVGPR-NEXT: .dynamic_vgpr_saved_count: 0x70
; CHECK-NEXT: .entry_point: _amdgpu_cs
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
@@ -66,7 +68,8 @@
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
-; CHECK-NEXT: .sgpr_count: 0x4
+; NODVGPR-NEXT: .sgpr_count: 0x4
+; DVGPR-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .sgpr_limit: 0x6a
; CHECK-NEXT: .threadgroup_dimensions:
; CHECK-NEXT: - 0x1
>From 11bc3ed97cc92ea922733868950a07e9b12b3694 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 7 Mar 2025 11:59:30 +0100
Subject: [PATCH 06/12] Fix num bits
---
llvm/lib/Target/AMDGPU/SIDefines.h | 2 +-
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 2 +-
.../AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll | 14 +++++++-------
3 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 8f9d099b25857..8e811b43a4532 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -552,7 +552,7 @@ enum Id { // HwRegCode, (6) [5:0]
enum Offset : unsigned { // Offset, (5) [10:6]
OFFSET_MEM_VIOL = 8,
- OFFSET_ME_ID = 8,
+ OFFSET_ME_ID = 8, // in HW_ID2
};
enum ModeRegisterMasks : uint32_t {
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 430d1824ef464..7f6020fc5b952 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -725,7 +725,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(
- AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 1));
+ AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
// The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
// (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
// SCC, so we need to check for 0 manually.
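For reference, the hwreg operand encodes (register, offset, width), so this
reads the 2-bit micro-engine ID at bits [9:8] of HW_ID2. With the previous
width of 1, a wave on ME 2 (0b10) would read back 0 and be misclassified as
the graphics queue. A minimal C++ sketch of the extraction (helper name is
hypothetical):

  // Models s_getreg_b32 dst, hwreg(HW_REG_HW_ID2, 8, 2).
  unsigned decodeMEID(unsigned HwId2) {
    return (HwId2 >> 8) & 0x3; // 0 = graphics, 1 or 2 = compute, 3 unused
  }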
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index d420af4ca100c..fb4aab836254f 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -6,7 +6,7 @@
define amdgpu_cs void @amdgpu_cs() #0 {
; CHECK-LABEL: amdgpu_cs:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
@@ -18,7 +18,7 @@ define amdgpu_cs void @amdgpu_cs() #0 {
define amdgpu_kernel void @kernel() #0 {
; CHECK-LABEL: kernel:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
@@ -30,7 +30,7 @@ define amdgpu_kernel void @kernel() #0 {
define amdgpu_cs void @with_local() #0 {
; CHECK-LABEL: with_local:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
; CHECK-NEXT: v_mov_b32_e32 v0, 13
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
@@ -48,7 +48,7 @@ define amdgpu_cs void @with_local() #0 {
define amdgpu_cs void @with_calls_inline_const() #0 {
; CHECK-LABEL: with_calls_inline_const:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
; CHECK-NEXT: v_mov_b32_e32 v0, 15
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
@@ -72,7 +72,7 @@ define amdgpu_cs void @with_calls_inline_const() #0 {
define amdgpu_cs void @with_calls_no_inline_const() #0 {
; CHECK-LABEL: with_calls_no_inline_const:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
; CHECK-NEXT: v_mov_b32_e32 v0, 15
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
@@ -96,7 +96,7 @@ define amdgpu_cs void @with_calls_no_inline_const() #0 {
define amdgpu_cs void @with_spills(ptr addrspace(1) %p1, ptr addrspace(1) %p2) #1 {
; CHECK-LABEL: with_spills:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:96
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
@@ -151,7 +151,7 @@ define amdgpu_cs void @with_spills(ptr addrspace(1) %p1, ptr addrspace(1) %p2) #
define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
; CHECK-LABEL: realign_stack:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
>From 618c897685dae02eb13416cb480d276d90cf1ce9 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 7 Mar 2025 14:12:17 +0100
Subject: [PATCH 07/12] Add new test
---
.../AMDGPU/machine-function-info-cwsr.ll | 72 +++++++++++++++++++
1 file changed, 72 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
diff --git a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
new file mode 100644
index 0000000000000..2de6699aab665
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
+
+; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
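+; (The 112 dwords are presumably the addressable VGPRs minus the VGPR
+; allocation granule, i.e. 128 - 16 on gfx1200 with +dynamic-vgpr, giving
+; 112 * 4 = 448 bytes; realign_stack below rounds this up to its 128-byte
+; stack alignment, hence 512.)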
+
+define amdgpu_cs void @amdgpu_cs() #0 {
+; CHECK-LABEL: {{^}}name: amdgpu_cs
+; CHECK: scratchReservedForDynamicVGPRs: 448
+ ret void
+}
+
+define amdgpu_kernel void @kernel() #0 {
+; CHECK-LABEL: {{^}}name: kernel
+; CHECK: scratchReservedForDynamicVGPRs: 448
+ ret void
+}
+
+define amdgpu_cs void @with_local() #0 {
+; CHECK-LABEL: {{^}}name: with_local
+; CHECK: scratchReservedForDynamicVGPRs: 448
+ %local = alloca i32, addrspace(5)
+ store volatile i8 13, ptr addrspace(5) %local
+ ret void
+}
+
+define amdgpu_cs void @with_calls() #0 {
+; CHECK-LABEL: {{^}}name: with_calls
+; CHECK: scratchReservedForDynamicVGPRs: 448
+ %local = alloca i32, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+; CHECK-LABEL: {{^}}name: realign_stack
+; CHECK: scratchReservedForDynamicVGPRs: 512
+ %v = alloca <32 x i32>, align 128, addrspace(5)
+ store <32 x i32> %x, ptr addrspace(5) %v
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+; Non-entry functions and graphics shaders can't run on a compute queue,
+; so they don't need to worry about CWSR.
+define amdgpu_gs void @amdgpu_gs() #0 {
+; CHECK-LABEL: {{^}}name: amdgpu_gs
+; CHECK: scratchReservedForDynamicVGPRs: 0
+ %local = alloca i32, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+define amdgpu_gfx void @amdgpu_gfx() #0 {
+; CHECK-LABEL: {{^}}name: amdgpu_gfx
+; CHECK: scratchReservedForDynamicVGPRs: 0
+ %local = alloca i32, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+define void @default() #0 {
+; CHECK-LABEL: {{^}}name: default
+; CHECK: scratchReservedForDynamicVGPRs: 0
+ ret void
+}
+
+declare amdgpu_gfx void @callee(i32) #0
+
+attributes #0 = { nounwind }
+
>From b9e300b0c18eafb97e23fdb7d2c3f4dc1bcfef75 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 7 Mar 2025 14:18:08 +0100
Subject: [PATCH 08/12] Serialize reserved scratch size
---
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 3 ++-
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 4 ++++
llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll | 2 ++
.../CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll | 1 +
.../MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll | 1 +
.../MIR/AMDGPU/machine-function-info-long-branch-reg.ll | 1 +
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir | 4 ++++
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll | 4 ++++
8 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index abd19c988a7eb..6f36cea33c13e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -713,7 +713,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
- Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()) {
+ Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
+ ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 6d75b83ea2223..e37fbc69b89b3 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -299,6 +299,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool HasInitWholeWave = false;
+ unsigned ScratchReservedForDynamicVGPRs = 0;
+
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
const TargetRegisterInfo &TRI,
@@ -350,6 +352,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
StringValue());
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
+ YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
+ MFI.ScratchReservedForDynamicVGPRs, 0);
}
};
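With the mapping above in place, the field round-trips through MIR
serialization; a minimal sketch of the resulting YAML (the 448 is an
assumed value for illustration):

  machineFunctionInfo:
    hasInitWholeWave: false
    scratchReservedForDynamicVGPRs: 448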
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index eb4ee118ec2e4..2bb31e926e39a 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
entry:
@@ -311,6 +312,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
entry:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index 6f5467b00ebcc..a712cb5f7f3e3 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -44,6 +44,7 @@
; AFTER-PEI-NEXT: sgprForEXECCopy: ''
; AFTER-PEI-NEXT: longBranchReservedReg: ''
; AFTER-PEI-NEXT: hasInitWholeWave: false
+; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
; AFTER-PEI-NEXT: body:
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
%wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 883657547519b..ac9a96c784de7 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
bb0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 278bf086d6088..1ad13db352403 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {
bb0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 89d831b51f694..944b2aa4dc175 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -53,6 +53,7 @@
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
+# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -158,6 +159,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
+# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -234,6 +236,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
+# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -311,6 +314,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
+# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index ec56de11b250a..dfe3e33e8b3ec 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -54,6 +54,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
%gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
@@ -101,6 +102,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
%gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
@@ -172,6 +174,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define void @function() {
ret void
@@ -225,6 +228,7 @@ define void @function() {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define void @function_nsz() #0 {
ret void
>From 5622bec8015d8fe32193dace0de0f7e7f6cbb77c Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 7 Mar 2025 14:18:19 +0100
Subject: [PATCH 09/12] Reword comment
---
.../test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index fb4aab836254f..83349df1f18cd 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -177,8 +177,7 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
ret void
}
-; Non-entry functions and graphics shaders can't run on a compute queue,
-; so they don't need to worry about CWSR.
+; Non-entry functions and graphics shaders don't need to worry about CWSR.
define amdgpu_gs void @amdgpu_gs() #0 {
; CHECK-LABEL: amdgpu_gs:
; CHECK: ; %bb.0:
>From eda38703cd5599b2cae411e2873c8fdc36ed8ac9 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 7 Mar 2025 14:45:43 +0100
Subject: [PATCH 10/12] Remove amdgpu-num-vgpr
---
.../dynamic-vgpr-reserve-stack-for-cwsr.ll | 51 ++-----------------
1 file changed, 3 insertions(+), 48 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index 83349df1f18cd..30333dc4b6d56 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -92,59 +92,16 @@ define amdgpu_cs void @with_calls_no_inline_const() #0 {
ret void
}
-; We're going to limit this to 16 VGPRs, so we need to spill the rest.
-define amdgpu_cs void @with_spills(ptr addrspace(1) %p1, ptr addrspace(1) %p2) #1 {
+define amdgpu_cs void @with_spills() {
; CHECK-LABEL: with_spills:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
-; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:96
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:80 ; 16-byte Folded Spill
-; CHECK-NEXT: s_clause 0x2
-; CHECK-NEXT: global_load_b128 v[8:11], v[0:1], off offset:112
-; CHECK-NEXT: global_load_b128 v[12:15], v[0:1], off offset:64
-; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:80
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:64 ; 16-byte Folded Spill
-; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:32
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:48 ; 16-byte Folded Spill
-; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:48
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:32 ; 16-byte Folded Spill
-; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16 ; 16-byte Folded Spill
-; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 ; 16-byte Folded Spill
-; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: s_clause 0x2
-; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:96
-; CHECK-NEXT: global_store_b128 v[2:3], v[8:11], off offset:112
-; CHECK-NEXT: global_store_b128 v[2:3], v[12:15], off offset:64
-; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:64 th:TH_LOAD_LU ; 16-byte Folded Reload
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:80
-; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:48 th:TH_LOAD_LU ; 16-byte Folded Reload
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:32
-; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:32 th:TH_LOAD_LU ; 16-byte Folded Reload
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:48
-; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:16 th:TH_LOAD_LU ; 16-byte Folded Reload
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off
-; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 th:TH_LOAD_LU ; 16-byte Folded Reload
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16
; CHECK-NEXT: s_alloc_vgpr 0
; CHECK-NEXT: s_endpgm
- %v = load <32 x i32>, ptr addrspace(1) %p1
- store <32 x i32> %v, ptr addrspace(1) %p2
+ call void asm "; spills", "~{v40},~{v42}"()
ret void
}
@@ -258,5 +215,3 @@ define void @default() #0 {
declare amdgpu_gfx void @callee(i32) #0
attributes #0 = { nounwind }
-attributes #1 = { nounwind "amdgpu-num-vgpr"="16"}
-
>From cf89dea971826c7c5c3be3b6c4385a59e9dd303b Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Sat, 8 Mar 2025 00:03:43 +0100
Subject: [PATCH 11/12] Tidy up and add tests with frame-ptr attr
---
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 5 +--
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 7 ----
.../dynamic-vgpr-reserve-stack-for-cwsr.ll | 34 +++++++++++++++++++
3 files changed, 37 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7f6020fc5b952..9c737b4f3e378 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -714,8 +714,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
assert(hasFP(MF));
Register FPReg = MFI->getFrameOffsetReg();
assert(FPReg != AMDGPU::FP_REG);
- Register SPReg = MFI->getStackPtrOffsetReg();
- assert(SPReg != AMDGPU::SP_REG);
unsigned VGPRSize =
llvm::alignTo((ST.getAddressableNumVGPRs() -
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) *
@@ -732,6 +730,9 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
if (requiresStackPointerReference(MF)) {
+ Register SPReg = MFI->getStackPtrOffsetReg();
+ assert(SPReg != AMDGPU::SP_REG);
+
// If at least one of the constants can be inlined, then we can use
// s_cselect. Otherwise, use a mov and cmovk.
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm()) ||
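Concretely, as a sketch based on the realign_stack output earlier in this
series: neither 0x100 nor 0x300 is an inline literal, so the stack pointer
is set up with the two-instruction form,

  s_movk_i32  s32, 0x100  ; assume the graphics-queue offset
  s_cmovk_i32 s32, 0x300  ; SCC set (compute queue): use the CWSR-adjusted offset

whereas a single s_cselect_b32 suffices when at least one operand can be
encoded as an inline constant.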
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4c6d5f2d459f7..e2bf272d69b60 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -512,13 +512,6 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const SIFrameLowering *TFI = ST.getFrameLowering();
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- // If we need to reserve scratch space for saving the VGPRs, then we should
- // use the frame register for accessing our own frame (which may start at a
- // non-zero offset).
- if (TFI->mayReserveScratchForCWSR(MF))
- return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
- : FuncInfo->getStackPtrOffsetReg();
-
// During ISel lowering we always reserve the stack pointer in entry and chain
// functions, but never actually want to reference it when accessing our own
// frame. If we need a frame pointer we use it, but otherwise we can just use
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index 30333dc4b6d56..ca2fca69dcf21 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -134,6 +134,38 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
ret void
}
+define amdgpu_cs void @frame_pointer_none() #1 {
+; CHECK-LABEL: frame_pointer_none:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT: v_mov_b32_e32 v0, 13
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %local = alloca i32, addrspace(5)
+ store volatile i8 13, ptr addrspace(5) %local
+ ret void
+}
+
+define amdgpu_cs void @frame_pointer_all() #2 {
+; CHECK-LABEL: frame_pointer_all:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT: v_mov_b32_e32 v0, 13
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %local = alloca i32, addrspace(5)
+ store volatile i8 13, ptr addrspace(5) %local
+ ret void
+}
+
; Non-entry functions and graphics shaders don't need to worry about CWSR.
define amdgpu_gs void @amdgpu_gs() #0 {
; CHECK-LABEL: amdgpu_gs:
@@ -215,3 +247,5 @@ define void @default() #0 {
declare amdgpu_gfx void @callee(i32) #0
attributes #0 = { nounwind }
+attributes #1 = { nounwind "frame-pointer"="none" }
+attributes #2 = { nounwind "frame-pointer"="all" }
>From 738a40f842fbbebc22170205146c488bb14fdb07 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Tue, 11 Mar 2025 14:21:44 +0100
Subject: [PATCH 12/12] Clarify unit for reserved scratch
---
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index e37fbc69b89b3..a60409b5a7e09 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -459,8 +459,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
- // The size of the scratch space reserved for the CWSR trap handler to spill
- // some of the dynamic VGPRs.
+ // The size in bytes of the scratch space reserved for the CWSR trap handler
+ // to spill some of the dynamic VGPRs.
unsigned ScratchReservedForDynamicVGPRs = 0;
// Tracks information about user SGPRs that will be setup by hardware which
@@ -793,8 +793,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
return ScratchReservedForDynamicVGPRs;
}
- void setScratchReservedForDynamicVGPRs(unsigned Size) {
- ScratchReservedForDynamicVGPRs = Size;
+ void setScratchReservedForDynamicVGPRs(unsigned SizeInBytes) {
+ ScratchReservedForDynamicVGPRs = SizeInBytes;
}
// Add user SGPRs.
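A hypothetical usage sketch for the renamed parameter (the value would
presumably be the VGPRSize byte count computed in
SIFrameLowering::emitEntryFunctionPrologue earlier in this series):

  // Hypothetical call site; the argument is a size in bytes, e.g. 448.
  MFI->setScratchReservedForDynamicVGPRs(VGPRSize);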