[llvm] [AMDGPU] Fix iterator invalidation during frame lowering (PR #163952)

Diana Picus via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 20 00:11:03 PDT 2025


https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/163952

>From 4c17de93e50aae2810081d7c8fdf26d36af8bed6 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 17 Oct 2025 12:07:54 +0200
Subject: [PATCH] [AMDGPU] Fix iterator invalidation during frame lowering

I was a bit too eager to remove the SI_WHOLE_WAVE_FUNC_SETUP instruction
during prolog emission. Erasing it invalidates MBBI, which in some cases
is still needed outside of `emitCSRSpillStores`.

Do the erasing at the end of prolog insertion instead.
---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp    |  12 +-
 .../CodeGen/AMDGPU/whole-wave-functions.ll    | 130 ++++++++++++++++++
 2 files changed, 137 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0189e7b90ca94..5c39f7a3d6daa 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1034,16 +1034,13 @@ void SIFrameLowering::emitCSRSpillStores(
 
   StoreWWMRegisters(WWMCalleeSavedRegs);
   if (FuncInfo->isWholeWaveFunction()) {
-    // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove
-    // it now. If we have already saved some WWM CSR registers, then the EXEC is
-    // already -1 and we don't need to do anything else. Otherwise, set EXEC to
-    // -1 here.
+    // If we have already saved some WWM CSR registers, then the EXEC is already
+    // -1 and we don't need to do anything else. Otherwise, set EXEC to -1 here.
     if (!ScratchExecCopy)
       buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
                            /*EnableInactiveLanes*/ true);
     else if (WWMCalleeSavedRegs.empty())
       EnableAllLanes();
-    TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
   } else if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
     BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
@@ -1340,6 +1337,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
          "Needed to save BP but didn't save it anywhere");
 
   assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
+
+  if (FuncInfo->isWholeWaveFunction()) {
+    // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose.
+    TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
+  }
 }
 
 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index e3437fded0429..a42c8ac706d27 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -767,6 +767,136 @@ define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) {
   ret void
 }
 
+define amdgpu_gfx_whole_wave void @realign_stack(i1 %active, i32 %x) {
+; DAGISEL-LABEL: realign_stack:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_mov_b32 s1, s33
+; DAGISEL-NEXT:    s_add_co_i32 s33, s32, 0x3ff
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    s_and_b32 s33, s33, 0xfffffc00
+; DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT:    s_mov_b32 s2, s34
+; DAGISEL-NEXT:    s_mov_b32 s34, s32
+; DAGISEL-NEXT:    s_addk_co_i32 s32, 0x800
+; DAGISEL-NEXT:    s_wait_storecnt 0x0
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
+; DAGISEL-NEXT:    s_wait_storecnt 0x0
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    s_mov_b32 s32, s34
+; DAGISEL-NEXT:    s_mov_b32 s34, s2
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT:    s_mov_b32 s33, s1
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: realign_stack:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_mov_b32 s1, s33
+; GISEL-NEXT:    s_add_co_i32 s33, s32, 0x3ff
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    s_and_b32 s33, s33, 0xfffffc00
+; GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT:    s_mov_b32 s2, s34
+; GISEL-NEXT:    s_mov_b32 s34, s32
+; GISEL-NEXT:    s_addk_co_i32 s32, 0x800
+; GISEL-NEXT:    s_wait_storecnt 0x0
+; GISEL-NEXT:    scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
+; GISEL-NEXT:    s_wait_storecnt 0x0
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    s_mov_b32 s32, s34
+; GISEL-NEXT:    s_mov_b32 s34, s2
+; GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL-NEXT:    s_mov_b32 s33, s1
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: realign_stack:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_mov_b32 s2, s33
+; DAGISEL64-NEXT:    s_add_co_i32 s33, s32, 0x3ff
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    s_and_b32 s33, s33, 0xfffffc00
+; DAGISEL64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT:    s_mov_b32 s3, s34
+; DAGISEL64-NEXT:    s_mov_b32 s34, s32
+; DAGISEL64-NEXT:    s_addk_co_i32 s32, 0x800
+; DAGISEL64-NEXT:    s_wait_storecnt 0x0
+; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
+; DAGISEL64-NEXT:    s_wait_storecnt 0x0
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    s_mov_b32 s32, s34
+; DAGISEL64-NEXT:    s_mov_b32 s34, s3
+; DAGISEL64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT:    s_mov_b32 s33, s2
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: realign_stack:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_mov_b32 s2, s33
+; GISEL64-NEXT:    s_add_co_i32 s33, s32, 0x3ff
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    s_and_b32 s33, s33, 0xfffffc00
+; GISEL64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT:    s_mov_b32 s3, s34
+; GISEL64-NEXT:    s_mov_b32 s34, s32
+; GISEL64-NEXT:    s_addk_co_i32 s32, 0x800
+; GISEL64-NEXT:    s_wait_storecnt 0x0
+; GISEL64-NEXT:    scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
+; GISEL64-NEXT:    s_wait_storecnt 0x0
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    s_mov_b32 s32, s34
+; GISEL64-NEXT:    s_mov_b32 s34, s3
+; GISEL64-NEXT:    s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT:    s_mov_b32 s33, s2
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-DAGISEL-LABEL: realign_stack:
+; GFX1250-DAGISEL:       ; %bb.0:
+; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s1, s33
+; GFX1250-DAGISEL-NEXT:    s_add_co_i32 s33, s32, 0x3ff
+; GFX1250-DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-DAGISEL-NEXT:    s_and_b32 s33, s33, 0xfffffc00
+; GFX1250-DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s2, s34
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s34, s32
+; GFX1250-DAGISEL-NEXT:    s_addk_co_i32 s32, 0x800
+; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
+; GFX1250-DAGISEL-NEXT:    s_wait_storecnt 0x0
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s32, s34
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s34, s2
+; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s33, s1
+; GFX1250-DAGISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %fussy = alloca i32, align 1024, addrspace(5)
+  store volatile i32 %x, ptr addrspace(5) %fussy, align 1024
+  ret void
+}
+
 define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL-LABEL: multiple_blocks:
 ; DAGISEL:       ; %bb.0:



More information about the llvm-commits mailing list