[llvm] [AMDGPU] Make chain functions receive a stack pointer (PR #184616)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 4 06:18:10 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Diana Picus (rovka)
<details>
<summary>Changes</summary>
Currently, chain functions are free to set up a stack pointer if they need one, and they assume they can start at scratch offset 0. This is not correct if CWSR and dynamic VGPRs are both enabled, since in that case we need to reserve an area at offset 0 for the trap handler, but only when running on a compute queue (which we determine at runtime). Rather than duplicate in every chain function the code sequence for determining if/how much scratch space needs to be reserved, this patch changes the ABI of chain functions so that they receive a stack pointer from their caller.
Since chain functions can no longer use plain offsets to access their own stack, we'll also need to allocate a frame pointer more often (and sometimes also a base pointer). For simplicity, we use the same registers that `amdgpu_gfx` functions do (s32, s33, s34). This may change in the future. Chain functions never return to their caller and thus don't need to preserve the frame or base pointer.
Another consequence is that now we might need to realign the stack in some cases (since it no longer starts at the infinitely aligned 0).
---
Patch is 119.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/184616.diff
12 Files Affected:
- (modified) llvm/docs/AMDGPUUsage.rst (+4-4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h (+1-3)
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+27-34)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+6-16)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll (+220-33)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll (+221-86)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll (+176-94)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll (-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll (-16)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll (+184-1)
- (modified) llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir (+4-4)
``````````diff
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 70523faccc395..036b4461ec06d 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -2252,10 +2252,10 @@ The AMDGPU backend supports the following calling conventions:
case the backend assumes that there are no inactive lanes upon entry; any inactive
lanes that need to be preserved must be explicitly present in the IR).
- Wave scratch is "empty" at function boundaries. There is no stack pointer input
- or output value, but functions are free to use scratch starting from an initial
- stack pointer. Calls to ``amdgpu_gfx`` functions are allowed and behave like they
- do in ``amdgpu_cs`` functions.
+ Chain functions receive a stack pointer from their caller (in s32), similar to
+ ``amdgpu_gfx`` functions. If needed, the frame pointer is s33 and the base pointer
+ is s34. Calls to ``amdgpu_gfx`` functions are allowed and behave like they do in
+ ``amdgpu_cs`` functions.
A function may have multiple exits (e.g. one chain exit and one plain ``ret void``
for when the wave ends), but all ``llvm.amdgcn.cs.chain`` exits must be in
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 1317210a445d2..5e5ad3dbd713a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -101,9 +101,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
bool isChainFunction() const { return IsChainFunction; }
// The stack is empty upon entry to this function.
- bool isBottomOfStack() const {
- return isEntryFunction() || isChainFunction();
- }
+ bool isBottomOfStack() const { return isEntryFunction(); }
bool isMemoryBound() const {
return MemoryBound;
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 4a62af56fd8e5..4ce3965ad6494 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1216,33 +1216,25 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// to determine the end of the prologue.
DebugLoc DL;
- if (FuncInfo->isChainFunction()) {
- // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
- // are free to set one up if they need it.
- bool UseSP = requiresStackPointerReference(MF);
- if (UseSP) {
- assert(StackPtrReg != AMDGPU::SP_REG);
-
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
- .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
- }
- }
bool HasFP = false;
bool HasBP = false;
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
+ // Chain functions always tail call, so there's no need to save and restore
+ // the FP or BP.
+ bool SavesStackRegs = !FuncInfo->isChainFunction();
+
if (TRI.hasStackRealignment(MF))
HasFP = true;
Register FramePtrRegScratchCopy;
if (!HasFP && !hasFP(MF)) {
// Emit the CSR spill stores with SP base register.
- emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
- FuncInfo->isChainFunction() ? Register() : StackPtrReg,
+ emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
FramePtrRegScratchCopy);
- } else {
+ } else if (SavesStackRegs) {
// CSR spill stores will use FP as base register.
Register SGPRForFPSaveRestoreCopy =
FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
@@ -1327,17 +1319,17 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
(void)FPSaved;
- assert((!HasFP || FPSaved) &&
+ assert((!HasFP || FPSaved || !SavesStackRegs) &&
"Needed to save FP but didn't save it anywhere");
// If we allow spilling to AGPRs we may have saved FP but then spill
// everything into AGPRs instead of the stack.
- assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
+ assert((HasFP || !FPSaved || !SavesStackRegs || EnableSpillVGPRToAGPR) &&
"Saved FP but didn't need it");
bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
(void)BPSaved;
- assert((!HasBP || BPSaved) &&
+ assert((!HasBP || BPSaved || !SavesStackRegs) &&
"Needed to save BP but didn't save it anywhere");
assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
@@ -1354,6 +1346,10 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
if (FuncInfo->isEntryFunction())
return;
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (FuncInfo->isChainFunction() && !MFI.hasTailCall())
+ return;
+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
@@ -1371,7 +1367,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MBBI = MBB.getFirstTerminator();
}
- const MachineFrameInfo &MFI = MF.getFrameInfo();
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = FuncInfo->isStackRealigned()
? NumBytes + MFI.getMaxAlign().value()
@@ -1427,8 +1422,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MIB.setMIFlag(MachineInstr::FrameDestroy);
} else {
// Insert the CSR spill restores with SP as the base register.
- emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits,
- FuncInfo->isChainFunction() ? Register() : StackPtrReg,
+ emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
FramePtrRegScratchCopy);
}
}
@@ -1656,6 +1650,11 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
}
+ // Chain functions don't return to the caller, so they don't need to preserve
+ // the FP and BP.
+ if (MFI->isChainFunction())
+ return;
+
// hasFP only knows about stack objects that already exist. We're now
// determining the stack slots that will be created, so we have to predict
// them. Stack objects force FP usage with calls.
@@ -2154,11 +2153,10 @@ static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- // For entry & chain functions we can use an immediate offset in most cases,
+ // For entry functions we can use an immediate offset in most cases,
// so the presence of calls doesn't imply we need a distinct frame pointer.
if (MFI.hasCalls() &&
- !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
- !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
+ !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
// All offsets are unsigned, so need to be addressed in the same direction
// as stack growth.
@@ -2167,9 +2165,7 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
return MFI.getStackSize() != 0;
}
- return (frameTriviallyRequiresSP(MFI) &&
- !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) ||
- MFI.isFrameAddressTaken() ||
+ return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
MF) ||
mayReserveScratchForCWSR(MF) ||
@@ -2188,21 +2184,18 @@ bool SIFrameLowering::mayReserveScratchForCWSR(
// register. We may need to initialize the stack pointer depending on the frame
// properties, which logically overlaps many of the cases where an ordinary
// function would require an FP.
-// Also used for chain functions. While not technically entry functions, chain
-// functions may need to set up a stack pointer in some situations.
bool SIFrameLowering::requiresStackPointerReference(
const MachineFunction &MF) const {
// Callable functions always require a stack pointer reference.
- assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
- MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
- "only expected to call this for entry points and chain functions");
+ assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
+         "only expected to call this for entry functions");
const MachineFrameInfo &MFI = MF.getFrameInfo();
// Entry points ordinarily don't need to initialize SP. We have to set it up
- // for callees if there are any. Also note tail calls are impossible/don't
- // make any sense for kernels.
- if (MFI.hasCalls())
+ // for callees if there are any. Also note tail calls are only possible via
+ // the `llvm.amdgcn.cs.chain` intrinsic.
+ if (MFI.hasCalls() || MFI.hasTailCall())
return true;
// We still need to initialize the SP if we're doing anything weird that
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 25a28ec471913..32efae69b20c8 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -93,20 +93,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MinNumAGPRs = MinNumAGPRAttr;
}
- if (AMDGPU::isChainCC(CC)) {
- // Chain functions don't receive an SP from their caller, but are free to
- // set one up. For now, we can use s32 to match what amdgpu_gfx functions
- // would use if called, but this can be revisited.
- // FIXME: Only reserve this if we actually need it.
- StackPtrOffsetReg = AMDGPU::SGPR32;
-
- ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;
-
- ArgInfo.PrivateSegmentBuffer =
- ArgDescriptor::createRegister(ScratchRSrcReg);
-
- ImplicitArgPtr = false;
- } else if (!isEntryFunction()) {
+ if (!isEntryFunction()) {
if (CC != CallingConv::AMDGPU_Gfx &&
CC != CallingConv::AMDGPU_Gfx_WholeWave)
ArgInfo = AMDGPUFunctionArgInfo::FixedABIFunctionInfo;
@@ -117,13 +104,16 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (!ST.hasFlatScratchEnabled()) {
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
- ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+      ScratchRSrcReg = AMDGPU::isChainCC(CC)
+                           ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
+                           : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(ScratchRSrcReg);
}
- if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
+ if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr") &&
+ !AMDGPU::isChainCC(CC))
ImplicitArgPtr = true;
} else {
ImplicitArgPtr = false;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 2889f37a65d97..89d5d3dc18ea8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -39,7 +39,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GISEL-GFX11-NEXT: s_mov_b32 s4, use at abs32@lo
; GISEL-GFX11-NEXT: s_mov_b32 s5, use at abs32@hi
-; GISEL-GFX11-NEXT: s_mov_b32 s32, 0
+; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GISEL-GFX11-NEXT: s_endpgm
;
@@ -58,7 +58,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
; GISEL-GFX10-NEXT: s_mov_b32 s4, use at abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, use at abs32@hi
; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GISEL-GFX10-NEXT: s_mov_b32 s32, 0
; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
;
@@ -71,7 +70,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; DAGISEL-GFX11-NEXT: s_mov_b32 s5, use at abs32@hi
; DAGISEL-GFX11-NEXT: s_mov_b32 s4, use at abs32@lo
-; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0
+; DAGISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; DAGISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
; DAGISEL-GFX11-NEXT: s_endpgm
;
@@ -90,7 +89,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use at abs32@hi
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use at abs32@lo
; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
-; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0
; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; DAGISEL-GFX10-NEXT: s_endpgm
call amdgpu_gfx void @use(<4 x i32> %sgpr, <4 x i32> %vgpr)
@@ -101,8 +99,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; GISEL-GFX11-LABEL: amdgpu_cs_chain_spill:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_mov_b32 s32, 0
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 4
; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32
; GISEL-GFX11-NEXT: scratch_store_b32 off, v17, s24
@@ -123,6 +119,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; GISEL-GFX11-NEXT: scratch_store_b32 off, v24, s24
; GISEL-GFX11-NEXT: scratch_store_b32 off, v25, s25
; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 40
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
; GISEL-GFX11-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
; GISEL-GFX11-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
; GISEL-GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
@@ -170,7 +167,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v37, v13
; GISEL-GFX10-NEXT: v_mov_b32_e32 v38, v14
; GISEL-GFX10-NEXT: v_mov_b32_e32 v39, v15
-; GISEL-GFX10-NEXT: s_mov_b32 s32, 0
; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32
; GISEL-GFX10-NEXT: buffer_store_dword v17, off, s[48:51], s32 offset:4
; GISEL-GFX10-NEXT: buffer_store_dword v18, off, s[48:51], s32 offset:8
@@ -229,8 +225,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_spill:
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0
-; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 60
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v31, s24
@@ -251,6 +245,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v24, s24
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v23, s25
; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 24
+; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v36, v11 :: v_dual_mov_b32 v37, v10
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v38, v9 :: v_dual_mov_b32 v39, v8
@@ -298,7 +293,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v37, v10
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v38, v9
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v39, v8
-; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0
; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32
; DAGISEL-GFX10-NEXT: buffer_store_dword v17, off, s[48:51], s32 offset:4
; DAGISEL-GFX10-NEXT: buffer_store_dword v18, off, s[48:51], s32 offset:8
@@ -362,11 +356,12 @@ define amdgpu_cs_chain void @alloca_and_call() {
; GISEL-GFX11: ; %bb.0: ; %.entry
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GISEL-GFX11-NEXT: s_mov_b32 s33, s32
; GISEL-GFX11-NEXT: s_mov_b32 s0, use at abs32@lo
; GISEL-GFX11-NEXT: s_mov_b32 s1, use at abs32@hi
-; GISEL-GFX11-NEXT: s_mov_b32 s32, 16
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v0, off
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-GFX11-NEXT: s_add_i32 s32, s32, 16
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v0, s33
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s33
; GISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-GFX11-NEXT: s_endpgm
;
@@ -374,13 +369,14 @@ define amdgpu_cs_chain void @alloca_and_call() {
; GISEL-GFX10: ; %bb.0: ; %.entry
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GISEL-GFX10-NEXT: s_mov_b32 s33, s32
; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; GISEL-GFX10-NEXT: s_mov_b32 s4, use at abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, use at abs32@hi
+; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], s33
+; GISEL-GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-GFX10-NEXT: s_movk_i32 s32, 0x200
+; GISEL-GFX10-NEXT: s_addk_i32 s32, 0x200
; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
;
@@ -388,11 +384,12 @@ define amdgpu_cs_chain void @alloca_and_call() {
; DAGISEL-GFX11: ; %bb.0: ; %.entry
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, 42
+; DAGISEL-GFX11-NEXT: s_mov_b32 s33, s32
; DAGISEL-GFX11-NEXT: s_mov_b32 s1, use at abs32@hi
; DAGISEL-GFX11-NEXT: s_mov_b32 s0, use at abs32@lo
-; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 16
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v0, off
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
+; DAGISEL-GFX11-NEXT: s_add_i32 s32, s32, 16
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v0, s33
+; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, s33
; DAGISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-GFX11-NEXT: s_endpgm
;
@@ -400,13 +397,14 @@ define amdgpu_cs_chain void @alloca_and_call() {
; DAGISEL-GFX10: ; %bb.0: ; %.entry
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 42
+; DAGISEL-GFX10-NEXT: s_mov_b32 s33, s32
; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use at abs32@hi
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use at abs32@lo
+; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], s33
+; DAGISEL-GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
-; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
-; DAGISEL-GFX10-NEXT: s_movk_i32 s32, 0x200
+; DAGISEL-GFX10-NEXT: s_addk_i32 s32, 0x200
; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; DAGISEL-GFX10-NEXT: s_endpgm
.entry:
@@ -429,6 +427,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
; GISEL-GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee at abs32@hi
; GISEL-GFX11-NEXT: s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT: s_mov_b32 s32, 0
; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1
; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5]
;
@@ -442,6 +441,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
; GISEL-GFX10-NEXT: v_mov_b32_e32 v10, v2
; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee at abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee at abs32@...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/184616
More information about the llvm-commits
mailing list