[llvm] [Wip][Draft] Restore SP correctly in functions with dynamic allocas (PR #122676)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 13 08:48:03 PST 2025


https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/122676

>From eb69dcb1094155da0d51d062ed1a6eb2df546891 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Mon, 13 Jan 2025 14:45:20 +0530
Subject: [PATCH] [AMDGPU] Restore SP correctly in functions with dynamic
 allocas

Currently, the AMDGPU backend sets up FP and then
increments SP by fixed size, from FP, in the prolog
and decrements it by the same amount in the epilog.
Prolog:
`tmp = sp + (alignment - 1)`
`fp &= -alignment`
`sp += frameSize + alignment`
Epilog:
`sp -= (frameSize + alignment)`

In the presence of dynamic alloca, this leads to incorrect restoration of SP.
This patch enforces the presence of a base pointer for all functions with dynamic allocas,
and SP is restored from the saved BP in the epilog.
Prolog:
`tmp = sp + (alignment - 1)`
`fp &= -alignment`
`bp = sp`
`sp += frameSize + alignment`
Epilog:
`sp += bp + frameSize + alignment`
`sp -= (frameSize + alignment)`

(Note: for dynamic allocas with default alignment, SP can be restored with saved FP as well.
However, for the sake of uniformity, presence of BP is enforced)

Fixes: SWDEV-408164
---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp    |  11 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |   5 +-
 .../GlobalISel/dynamic-alloca-uniform.ll      |  53 +++-
 .../AMDGPU/GlobalISel/non-entry-alloca.ll     |   8 +
 llvm/test/CodeGen/AMDGPU/amdpal-callable.ll   |  26 +-
 .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 248 ++++++++++++++----
 llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll  |  16 ++
 .../AMDGPU/pal-metadata-3.0-callable.ll       |   4 +-
 8 files changed, 293 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index dcd4f0f65e8ef2..274416ec054817 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1259,6 +1259,17 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
   Register FramePtrRegScratchCopy;
   Register SGPRForFPSaveRestoreCopy =
       FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
+
+  if (MFI.hasVarSizedObjects()) {
+    assert(TRI.hasBasePointer(MF) &&
+           "Variable sized objects require base pointer to be setup!");
+    Register BasePtrReg = TRI.getBaseRegister();
+    // Restore SP to fixed frame size
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
+        .addReg(BasePtrReg)
+        .addImm(RoundedSize * getScratchScaleFactor(ST))
+        .setMIFlag(MachineInstr::FrameDestroy);
+  }
   if (FPSaved) {
     // CSR spill restores should use FP as base register. If
     // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 704435dad65d7b..5a4e6ec48da823 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -525,8 +525,11 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
   // When we need stack realignment, we can't reference off of the
   // stack pointer, so we reserve a base pointer.
+  // For functions with dynamically sized stack objects, we need to reference
+  // off the base pointer in the epilog to restore the stack frame.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  return MFI.getNumFixedObjects() && shouldRealignStack(MF);
+  return (MFI.getNumFixedObjects() && shouldRealignStack(MF)) ||
+         MFI.hasVarSizedObjects();
 }
 
 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index ae055ea041297e..e497ed6526a059 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -69,6 +69,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s7, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_mov_b32 s8, s34
+; GFX9-NEXT:    s_mov_b32 s34, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
@@ -86,6 +88,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX9-NEXT:    s_and_b32 s4, s4, -16
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-NEXT:    s_add_u32 s32, s6, s4
+; GFX9-NEXT:    s_add_i32 s32, s34, 0x400
+; GFX9-NEXT:    s_mov_b32 s34, s8
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -95,6 +99,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s7, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_mov_b32 s8, s34
+; GFX10-NEXT:    s_mov_b32 s34, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
@@ -112,6 +118,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX10-NEXT:    s_and_b32 s4, s4, -16
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
 ; GFX10-NEXT:    s_add_u32 s32, s6, s4
+; GFX10-NEXT:    s_add_i32 s32, s34, 0x200
+; GFX10-NEXT:    s_mov_b32 s34, s8
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -120,13 +128,15 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_mov_b32 s4, s34
+; GFX11-NEXT:    s_mov_b32 s34, s32
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s2, s32
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_mov_b32 s33, s3
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -136,8 +146,10 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s0, s0, -16
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_u32 s32, s2, s0
+; GFX11-NEXT:    s_add_i32 s32, s34, 16
+; GFX11-NEXT:    s_mov_b32 s34, s4
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %n = load i32, ptr addrspace(4) @gv, align 4
@@ -210,6 +222,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s7, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_mov_b32 s8, s34
+; GFX9-NEXT:    s_mov_b32 s34, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
@@ -227,6 +241,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX9-NEXT:    s_and_b32 s4, s4, -16
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-NEXT:    s_add_u32 s32, s6, s4
+; GFX9-NEXT:    s_add_i32 s32, s34, 0x400
+; GFX9-NEXT:    s_mov_b32 s34, s8
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -236,6 +252,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s7, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_mov_b32 s8, s34
+; GFX10-NEXT:    s_mov_b32 s34, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
@@ -253,6 +271,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX10-NEXT:    s_and_b32 s4, s4, -16
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
 ; GFX10-NEXT:    s_add_u32 s32, s6, s4
+; GFX10-NEXT:    s_add_i32 s32, s34, 0x200
+; GFX10-NEXT:    s_mov_b32 s34, s8
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -261,13 +281,15 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_mov_b32 s4, s34
+; GFX11-NEXT:    s_mov_b32 s34, s32
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s2, s32
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_mov_b32 s33, s3
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -277,8 +299,10 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s0, s0, -16
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_u32 s32, s2, s0
+; GFX11-NEXT:    s_add_i32 s32, s34, 16
+; GFX11-NEXT:    s_mov_b32 s34, s4
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %n = load i32, ptr addrspace(4) @gv, align 16
@@ -355,6 +379,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_add_i32 s33, s32, 0x7c0
 ; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffff800
+; GFX9-NEXT:    s_mov_b32 s7, s34
+; GFX9-NEXT:    s_mov_b32 s34, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x1000
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
@@ -373,6 +399,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_and_b32 s4, s4, -16
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-NEXT:    s_add_i32 s32, s34, 0x1000
+; GFX9-NEXT:    s_mov_b32 s34, s7
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf000
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -382,8 +410,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_add_i32 s33, s32, 0x3e0
-; GFX10-NEXT:    s_addk_i32 s32, 0x800
+; GFX10-NEXT:    s_mov_b32 s7, s34
 ; GFX10-NEXT:    s_and_b32 s33, s33, 0xfffffc00
+; GFX10-NEXT:    s_mov_b32 s34, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x800
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
@@ -401,6 +431,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    s_and_b32 s4, s4, -16
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
 ; GFX10-NEXT:    s_add_u32 s32, s5, s4
+; GFX10-NEXT:    s_add_i32 s32, s34, 0x800
+; GFX10-NEXT:    s_mov_b32 s34, s7
 ; GFX10-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -409,8 +441,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_add_i32 s33, s32, 31
-; GFX11-NEXT:    s_add_i32 s32, s32, 64
+; GFX11-NEXT:    s_mov_b32 s3, s34
 ; GFX11-NEXT:    s_and_not1_b32 s33, s33, 31
+; GFX11-NEXT:    s_mov_b32 s34, s32
+; GFX11-NEXT:    s_add_i32 s32, s32, 64
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
@@ -429,7 +463,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s32, s34, 64
+; GFX11-NEXT:    s_mov_b32 s34, s3
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffc0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %n = load i32, ptr addrspace(4) @gv
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index 69abef02d3d924..f4e6b7c033b3c4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -151,8 +151,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s7, s33
+; GCN-NEXT:    s_mov_b32 s8, s34
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_mov_b32 s34, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB2_3
@@ -178,8 +180,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
 ; GCN-NEXT:  .LBB2_3: ; %bb.2
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_add_i32 s32, s34, 0x400
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b32 s34, s8
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s7
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -216,8 +220,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s7, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GCN-NEXT:    s_mov_b32 s8, s34
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GCN-NEXT:    s_mov_b32 s34, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x2000
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB3_2
@@ -240,8 +246,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
 ; GCN-NEXT:  .LBB3_2: ; %bb.1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_add_i32 s32, s34, 0x2000
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b32 s34, s8
 ; GCN-NEXT:    s_addk_i32 s32, 0xe000
 ; GCN-NEXT:    s_mov_b32 s33, s7
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index a5f915c48ebeea..2bd58f41ec7905 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -149,15 +149,15 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      dynamic_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x28{{$}}
+; GCN-NEXT:        .sgpr_count:     0x2a{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; SDAG-NEXT:        .vgpr_count:     0x2{{$}}
 ; GISEL-NEXT:        .vgpr_count:     0x3{{$}}
 ; GCN-NEXT:      dynamic_stack_loop:
 ; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; SDAG-NEXT:        .sgpr_count:     0x25{{$}}
-; GISEL-NEXT:        .sgpr_count:     0x26{{$}}
+; SDAG-NEXT:        .sgpr_count:     0x27{{$}}
+; GISEL-NEXT:        .sgpr_count:     0x28{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; SDAG-NEXT:        .vgpr_count:     0x3{{$}}
 ; GISEL-NEXT:        .vgpr_count:     0x4{{$}}
@@ -182,22 +182,22 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      no_stack_extern_call:
 ; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x2a{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x2e{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; GCN-NEXT:        .vgpr_count:    0x2b{{$}}
 ; GCN-NEXT:      no_stack_extern_call_many_args:
 ; GCN-NEXT:        .backend_stack_size: 0x90{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x2a{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x2e{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x90{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
 ; GCN-NEXT:      no_stack_indirect_call:
 ; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x2a{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x2e{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
 ; GCN-NEXT:      simple_lds:
@@ -227,15 +227,15 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_stack_extern_call:
 ; GCN-NEXT:        .backend_stack_size: 0x20{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x2a{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x2e{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
 ; GCN-NEXT:      simple_stack_indirect_call:
 ; GCN-NEXT:        .backend_stack_size: 0x20{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x2a{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x2e{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
 ; GCN-NEXT:      simple_stack_recurse:
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 9acb3a42ae102c..c298a5609ddb35 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -1064,10 +1064,12 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s34
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-SDAG-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1082,8 +1084,10 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s10
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1093,10 +1097,12 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1111,8 +1117,10 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1122,17 +1130,18 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-SDAG-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
@@ -1144,7 +1153,8 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s34, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s5
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1153,28 +1163,31 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s34, 16
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, addrspace(5)
@@ -1189,10 +1202,12 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
 ; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s34
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x4000
 ; GFX9-SDAG-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1208,8 +1223,10 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 10
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s34, 0x4000
 ; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s10
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xc000
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1220,10 +1237,12 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
 ; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x4000
 ; GFX9-GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1239,8 +1258,10 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 10
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_i32 s32, s34, 0x4000
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xc000
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1251,17 +1272,18 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
 ; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x100
 ; GFX11-SDAG-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
@@ -1274,7 +1296,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s34, 0x100
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s5
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff00
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1284,17 +1307,18 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
 ; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x100
 ; GFX11-GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
@@ -1305,8 +1329,10 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s34, 0x100
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff00
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, align 128, addrspace(5)
@@ -1320,10 +1346,12 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s34
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-SDAG-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1338,8 +1366,10 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s10
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1349,10 +1379,12 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1367,8 +1399,10 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 22
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1378,17 +1412,18 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-SDAG-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
@@ -1400,7 +1435,8 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s34, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s5
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1409,28 +1445,31 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 22
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 22
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s34, 16
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, align 2, addrspace(5)
@@ -1445,10 +1484,12 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s34
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-SDAG-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1463,8 +1504,10 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s10
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1475,10 +1518,12 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-GISEL-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1493,8 +1538,10 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1504,10 +1551,12 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
@@ -1528,7 +1577,8 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s34, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s5
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1537,10 +1587,12 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
@@ -1554,13 +1606,15 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s34, 16
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1575,6 +1629,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s10, s33
 ; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s34
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x4000
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
@@ -1596,8 +1652,10 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s7, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s34, 0x4000
 ; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s11
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xc000
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s10
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1609,10 +1667,12 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
 ; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x4000
 ; GFX9-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1628,8 +1688,10 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_i32 s32, s34, 0x4000
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xc000
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1640,14 +1702,16 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
 ; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT:    s_mov_b32 s6, s34
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x100
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
 ; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
 ; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
 ; GFX11-SDAG-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -1663,8 +1727,10 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s34, 0x100
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s6
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff00
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1674,12 +1740,13 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
 ; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x100
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX11-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
@@ -1697,8 +1764,10 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s34, 0x100
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff00
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1714,10 +1783,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s34
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-SDAG-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1732,8 +1803,10 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s10
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1744,10 +1817,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-GISEL-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -1762,8 +1837,10 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1773,10 +1850,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
@@ -1797,7 +1876,8 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s34, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s5
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1806,10 +1886,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
@@ -1823,13 +1905,15 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB13_1
 ; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s34, 16
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1844,9 +1928,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s13, s33
 ; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT:    s_mov_b32 s14, s34
 ; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x3000
 ; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB14_6
@@ -1911,10 +1997,12 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s34, 0x3000
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s14
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xd000
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s13
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1924,9 +2012,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s13, s33
 ; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT:    s_mov_b32 s14, s34
 ; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x3000
 ; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB14_6
@@ -1986,14 +2076,16 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT:  ; %bb.8:
 ; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
 ; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s8, 6
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_i32 s32, s34, 0x3000
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s14
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xd000
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s13
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2003,9 +2095,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_mov_b32 s7, s33
 ; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT:    s_mov_b32 s8, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-SDAG-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xc0
 ; GFX11-SDAG-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB14_6
@@ -2079,9 +2173,10 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s7
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s34, 0xc0
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s8
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff40
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s7
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
@@ -2089,9 +2184,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_mov_b32 s7, s33
 ; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT:    s_mov_b32 s8, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-GISEL-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xc0
 ; GFX11-GISEL-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB14_6
@@ -2154,16 +2251,18 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_7
 ; GFX11-GISEL-NEXT:  ; %bb.8:
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s7
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s34, 0xc0
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s8
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff40
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s7
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %n, 0
@@ -2189,9 +2288,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s11, s33
 ; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT:    s_mov_b32 s12, s34
 ; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
 ; GFX9-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x2000
 ; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -2243,6 +2344,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:  .LBB15_8: ; %bb.2
 ; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s34, 0x2000
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s12
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xe000
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s11
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -2253,9 +2356,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s11, s33
 ; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT:    s_mov_b32 s12, s34
 ; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x2000
 ; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -2307,6 +2412,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:  .LBB15_8: ; %bb.2
 ; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_add_i32 s32, s34, 0x2000
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s12
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xe000
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s11
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -2317,9 +2424,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
 ; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT:    s_mov_b32 s6, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-SDAG-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x80
 ; GFX11-SDAG-NEXT:    v_cmpx_ne_u32_e32 0, v0
 ; GFX11-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -2376,7 +2485,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:  .LBB15_8: ; %bb.2
 ; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s34, 0x80
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s6
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff80
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -2386,9 +2496,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_mov_b32 s5, s33
 ; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT:    s_mov_b32 s6, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-GISEL-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x80
 ; GFX11-GISEL-NEXT:    v_cmpx_ne_u32_e32 0, v0
 ; GFX11-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -2446,6 +2558,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:  .LBB15_8: ; %bb.2
 ; GFX11-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s34, 0x80
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s6
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff80
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2472,10 +2586,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s34
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-SDAG-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -2490,8 +2606,10 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s10
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -2502,10 +2620,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-GISEL-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -2520,8 +2640,10 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2531,10 +2653,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
@@ -2555,7 +2679,8 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s34, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s5
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2564,10 +2689,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
@@ -2581,13 +2708,15 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s34, 16
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i16 %n, align 2, addrspace(5)
@@ -2601,10 +2730,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s34
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-SDAG-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -2619,8 +2750,10 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s10
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -2630,10 +2763,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-GISEL-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
@@ -2648,8 +2783,10 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_i32 s32, s34, 0x400
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2659,17 +2796,18 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-SDAG-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB17_1
@@ -2681,7 +2819,8 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s34, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s5
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2690,28 +2829,31 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-GISEL-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s34, 16
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i64 %n, align 2, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 2bd60e869f843a..da130bc4030354 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -219,8 +219,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
 ; MUBUF:       ; %bb.0: ; %entry
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_mov_b32 s7, s33
+; MUBUF-NEXT:    s_mov_b32 s8, s34
 ; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_mov_b32 s34, s32
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
 ; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; MUBUF-NEXT:    s_cbranch_execz .LBB2_3
@@ -244,8 +246,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
 ; MUBUF-NEXT:  .LBB2_3: ; %bb.2
 ; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    s_add_i32 s32, s34, 0x400
 ; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s34, s8
 ; MUBUF-NEXT:    s_addk_i32 s32, 0xfc00
 ; MUBUF-NEXT:    s_mov_b32 s33, s7
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
@@ -254,8 +258,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
 ; FLATSCR:       ; %bb.0: ; %entry
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_mov_b32 s3, s33
+; FLATSCR-NEXT:    s_mov_b32 s4, s34
 ; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_mov_b32 s34, s32
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
 ; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; FLATSCR-NEXT:    s_cbranch_execz .LBB2_3
@@ -278,8 +284,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
 ; FLATSCR-NEXT:  .LBB2_3: ; %bb.2
 ; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    s_add_i32 s32, s34, 16
 ; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s34, s4
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, -16
 ; FLATSCR-NEXT:    s_mov_b32 s33, s3
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
@@ -316,8 +324,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_mov_b32 s7, s33
 ; MUBUF-NEXT:    s_add_i32 s33, s32, 0xfc0
+; MUBUF-NEXT:    s_mov_b32 s8, s34
 ; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; MUBUF-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; MUBUF-NEXT:    s_mov_b32 s34, s32
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x2000
 ; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; MUBUF-NEXT:    s_cbranch_execz .LBB3_2
@@ -339,8 +349,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
 ; MUBUF-NEXT:  .LBB3_2: ; %bb.1
 ; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    s_add_i32 s32, s34, 0x2000
 ; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s34, s8
 ; MUBUF-NEXT:    s_addk_i32 s32, 0xe000
 ; MUBUF-NEXT:    s_mov_b32 s33, s7
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
@@ -350,8 +362,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_mov_b32 s3, s33
 ; FLATSCR-NEXT:    s_add_i32 s33, s32, 63
+; FLATSCR-NEXT:    s_mov_b32 s4, s34
 ; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; FLATSCR-NEXT:    s_andn2_b32 s33, s33, 63
+; FLATSCR-NEXT:    s_mov_b32 s34, s32
 ; FLATSCR-NEXT:    s_addk_i32 s32, 0x80
 ; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; FLATSCR-NEXT:    s_cbranch_execz .LBB3_2
@@ -371,8 +385,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
 ; FLATSCR-NEXT:  .LBB3_2: ; %bb.1
 ; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    s_add_i32 s32, s34, 0x80
 ; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s34, s4
 ; FLATSCR-NEXT:    s_addk_i32 s32, 0xff80
 ; FLATSCR-NEXT:    s_mov_b32 s33, s3
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
index 538ce15979de88..eb67d92fa34efb 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -68,13 +68,13 @@
 ; CHECK-NEXT:      dynamic_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x10
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x22
+; CHECK-NEXT:        .sgpr_count:     0x23
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
 ; CHECK-NEXT:        .vgpr_count:     0x2
 ; CHECK-NEXT:      dynamic_stack_loop:
 ; CHECK-NEXT:        .backend_stack_size: 0x10
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x22
+; CHECK-NEXT:        .sgpr_count:     0x23
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
 ; CHECK-NEXT:        .vgpr_count:     0x3
 ; CHECK-NEXT:      multiple_stack:



More information about the llvm-commits mailing list