[llvm] [AMDGPU] Restore SP correctly in functions with dynamic allocas (PR #122743)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 13 08:54:45 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Aaditya (easyonaadit)
<details>
<summary>Changes</summary>
Currently, the AMDGPU backend sets up FP and then
increments SP by a fixed size, from FP, in the prolog and decrements it by the same amount in the epilog. Prolog:
`fp = sp + (alignment - 1)`
`fp &= -alignment`
`sp += frameSize + alignment`
Epilog:
`sp -= (frameSize + alignment)`
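In the presence of dynamic allocas, the function body additionally moves SP by an amount known only at run time, so subtracting just the fixed size in the epilog leads to incorrect restoration of SP. This patch enforces the presence of a base pointer for all functions with dynamic allocas, and SP is restored from the saved BP in the epilog. Prolog: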
`fp = sp + (alignment - 1)`
`fp &= -alignment`
`bp = sp`
`sp += frameSize + alignment`
Epilog:
`sp = bp + frameSize + alignment`
`sp -= (frameSize + alignment)`
(Note: for dynamic allocas with default alignment, SP can be restored from the saved FP as well. However, for the sake of uniformity, the presence of a BP is enforced.)
Fixes: SWDEV-408164
---
Patch is 75.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122743.diff
8 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+11)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+4-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll (+44-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/amdpal-callable.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll (+195-53)
- (modified) llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll (+16)
- (modified) llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index dcd4f0f65e8ef2..274416ec054817 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1259,6 +1259,17 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
Register FramePtrRegScratchCopy;
Register SGPRForFPSaveRestoreCopy =
FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
+
+ if (MFI.hasVarSizedObjects()) {
+ assert(TRI.hasBasePointer(MF) &&
+ "Variable sized objects require base pointer to be setup!");
+ Register BasePtrReg = TRI.getBaseRegister();
+ // Restore SP to fixed frame size
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
+ .addReg(BasePtrReg)
+ .addImm(RoundedSize * getScratchScaleFactor(ST))
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
if (FPSaved) {
// CSR spill restores should use FP as base register. If
// SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 704435dad65d7b..5a4e6ec48da823 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -525,8 +525,11 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
// When we need stack realignment, we can't reference off of the
// stack pointer, so we reserve a base pointer.
+ // For functions with dynamically sized stack objects, we need to reference
+ // off the base pointer in the epilog to restore the stack frame.
const MachineFrameInfo &MFI = MF.getFrameInfo();
- return MFI.getNumFixedObjects() && shouldRealignStack(MF);
+ return (MFI.getNumFixedObjects() && shouldRealignStack(MF)) ||
+ MFI.hasVarSizedObjects();
}
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index ae055ea041297e..e497ed6526a059 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -69,6 +69,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s7, s33
; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_mov_b32 s8, s34
+; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -86,6 +88,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX9-NEXT: s_and_b32 s4, s4, -16
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s32, s6, s4
+; GFX9-NEXT: s_add_i32 s32, s34, 0x400
+; GFX9-NEXT: s_mov_b32 s34, s8
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -95,6 +99,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s7, s33
; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_mov_b32 s8, s34
+; GFX10-NEXT: s_mov_b32 s34, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -112,6 +118,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
; GFX10-NEXT: s_add_u32 s32, s6, s4
+; GFX10-NEXT: s_add_i32 s32, s34, 0x200
+; GFX10-NEXT: s_mov_b32 s34, s8
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -120,13 +128,15 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, s33
; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_mov_b32 s4, s34
+; GFX11-NEXT: s_mov_b32 s34, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s2, s32
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b32 s33, s3
; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -136,8 +146,10 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, -16
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s32, s2, s0
+; GFX11-NEXT: s_add_i32 s32, s34, 16
+; GFX11-NEXT: s_mov_b32 s34, s4
; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_setpc_b64 s[30:31]
%n = load i32, ptr addrspace(4) @gv, align 4
@@ -210,6 +222,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s7, s33
; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_mov_b32 s8, s34
+; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -227,6 +241,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX9-NEXT: s_and_b32 s4, s4, -16
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s32, s6, s4
+; GFX9-NEXT: s_add_i32 s32, s34, 0x400
+; GFX9-NEXT: s_mov_b32 s34, s8
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -236,6 +252,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s7, s33
; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_mov_b32 s8, s34
+; GFX10-NEXT: s_mov_b32 s34, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -253,6 +271,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
; GFX10-NEXT: s_add_u32 s32, s6, s4
+; GFX10-NEXT: s_add_i32 s32, s34, 0x200
+; GFX10-NEXT: s_mov_b32 s34, s8
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -261,13 +281,15 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, s33
; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_mov_b32 s4, s34
+; GFX11-NEXT: s_mov_b32 s34, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s2, s32
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b32 s33, s3
; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -277,8 +299,10 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, -16
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s32, s2, s0
+; GFX11-NEXT: s_add_i32 s32, s34, 16
+; GFX11-NEXT: s_mov_b32 s34, s4
; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_setpc_b64 s[30:31]
%n = load i32, ptr addrspace(4) @gv, align 16
@@ -355,6 +379,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
+; GFX9-NEXT: s_mov_b32 s7, s34
+; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_addk_i32 s32, 0x1000
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -373,6 +399,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX9-NEXT: s_and_b32 s4, s4, -16
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s32, s5, s4
+; GFX9-NEXT: s_add_i32 s32, s34, 0x1000
+; GFX9-NEXT: s_mov_b32 s34, s7
; GFX9-NEXT: s_addk_i32 s32, 0xf000
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -382,8 +410,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s6, s33
; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
-; GFX10-NEXT: s_addk_i32 s32, 0x800
+; GFX10-NEXT: s_mov_b32 s7, s34
; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
+; GFX10-NEXT: s_mov_b32 s34, s32
+; GFX10-NEXT: s_addk_i32 s32, 0x800
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
@@ -401,6 +431,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
; GFX10-NEXT: s_add_u32 s32, s5, s4
+; GFX10-NEXT: s_add_i32 s32, s34, 0x800
+; GFX10-NEXT: s_mov_b32 s34, s7
; GFX10-NEXT: s_addk_i32 s32, 0xf800
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -409,8 +441,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_add_i32 s33, s32, 31
-; GFX11-NEXT: s_add_i32 s32, s32, 64
+; GFX11-NEXT: s_mov_b32 s3, s34
; GFX11-NEXT: s_and_not1_b32 s33, s33, 31
+; GFX11-NEXT: s_mov_b32 s34, s32
+; GFX11-NEXT: s_add_i32 s32, s32, 64
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
@@ -429,7 +463,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-NEXT: s_add_u32 s32, s1, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s32, s34, 64
+; GFX11-NEXT: s_mov_b32 s34, s3
; GFX11-NEXT: s_addk_i32 s32, 0xffc0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%n = load i32, ptr addrspace(4) @gv
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index 69abef02d3d924..f4e6b7c033b3c4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -151,8 +151,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s7, s33
+; GCN-NEXT: s_mov_b32 s8, s34
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB2_3
@@ -178,8 +180,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; GCN-NEXT: .LBB2_3: ; %bb.2
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_add_i32 s32, s34, 0x400
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s34, s8
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s7
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -216,8 +220,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s7, s33
; GCN-NEXT: s_add_i32 s33, s32, 0xfc0
+; GCN-NEXT: s_mov_b32 s8, s34
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: s_addk_i32 s32, 0x2000
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB3_2
@@ -240,8 +246,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; GCN-NEXT: .LBB3_2: ; %bb.1
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_add_i32 s32, s34, 0x2000
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s34, s8
; GCN-NEXT: s_addk_i32 s32, 0xe000
; GCN-NEXT: s_mov_b32 s33, s7
; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index a5f915c48ebeea..2bd58f41ec7905 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -149,15 +149,15 @@ attributes #0 = { nounwind }
; GCN-NEXT: dynamic_stack:
; GCN-NEXT: .backend_stack_size: 0x10{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
-; GCN-NEXT: .sgpr_count: 0x28{{$}}
+; GCN-NEXT: .sgpr_count: 0x2a{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; SDAG-NEXT: .vgpr_count: 0x2{{$}}
; GISEL-NEXT: .vgpr_count: 0x3{{$}}
; GCN-NEXT: dynamic_stack_loop:
; GCN-NEXT: .backend_stack_size: 0x10{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
-; SDAG-NEXT: .sgpr_count: 0x25{{$}}
-; GISEL-NEXT: .sgpr_count: 0x26{{$}}
+; SDAG-NEXT: .sgpr_count: 0x27{{$}}
+; GISEL-NEXT: .sgpr_count: 0x28{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; SDAG-NEXT: .vgpr_count: 0x3{{$}}
; GISEL-NEXT: .vgpr_count: 0x4{{$}}
@@ -182,22 +182,22 @@ attributes #0 = { nounwind }
; GCN-NEXT: no_stack_extern_call:
; GCN-NEXT: .backend_stack_size: 0x10{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x28{{$}}
-; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
+; GFX8-NEXT: .sgpr_count: 0x2a{{$}}
+; GFX9-NEXT: .sgpr_count: 0x2e{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT: .vgpr_count: 0x2b{{$}}
; GCN-NEXT: no_stack_extern_call_many_args:
; GCN-NEXT: .backend_stack_size: 0x90{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x28{{$}}
-; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
+; GFX8-NEXT: .sgpr_count: 0x2a{{$}}
+; GFX9-NEXT: .sgpr_count: 0x2e{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}}
; GCN-NEXT: .vgpr_count: 0x2b{{$}}
; GCN-NEXT: no_stack_indirect_call:
; GCN-NEXT: .backend_stack_size: 0x10{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x28{{$}}
-; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
+; GFX8-NEXT: .sgpr_count: 0x2a{{$}}
+; GFX9-NEXT: .sgpr_count: 0x2e{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT: .vgpr_count: 0x2b{{$}}
; GCN-NEXT: simple_lds:
@@ -227,15 +227,15 @@ attributes #0 = { nounwind }
; GCN-NEXT: simple_stack_extern_call:
; GCN-NEXT: .backend_stack_size: 0x20{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x28{{$}}
-; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
+; GFX8-NEXT: .sgpr_count: 0x2a{{$}}
+; GFX9-NEXT: .sgpr_count: 0x2e{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT: .vgpr_count: 0x2b{{$}}
; GCN-NEXT: simple_stack_indirect_call:
; GCN-NEXT: .backend_stack_size: 0x20{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x28{{$}}
-; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
+; GFX8-NEXT: .sgpr_count: 0x2a{{$}}
+; GFX9-NEXT: .sgpr_count: 0x2e{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT: .vgpr_count: 0x2b{{$}}
; GCN-NEXT: simple_stack_recurse:
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 9acb3a42ae102c..c298a5609ddb35 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -1064,10 +1064,12 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s34
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
; GFX9-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400
; GFX9-SDAG-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
@@ -1082,8 +1084,10 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x400
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s10
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1093,10 +1097,12 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT: s_mov_b32 s10, s34
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
; GFX9-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s32
; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400
; GFX9-GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
@@ -1111,8 +1117,10 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x400
; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s10
; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1122,17 +1130,18 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s34
; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s32
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s32
; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16
; GFX11-SDAG-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_delay_alu in...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/122743
More information about the llvm-commits
mailing list