[llvm-branch-commits] [llvm] [amdgpu-cfi: 5/9]: [AMDGPU] Implement CFI for non-kernel functions (PR #183153)
Scott Linder via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Apr 6 13:48:17 PDT 2026
slinder1 wrote:
Changes since last push:
```diff
diff --git b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1445,6 +1445,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
+ // Chain functions never return, so there's no need to save and restore the FP
+ // or BP.
+ bool SavesStackRegs = !FuncInfo->isChainFunction();
+
const bool NeedsFrameMoves = MF.needsFrameMoves();
if (NeedsFrameMoves)
@@ -1456,10 +1460,9 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
Register FramePtrRegScratchCopy;
if (!HasFP && !hasFP(MF)) {
// Emit the CSR spill stores with SP base register.
- emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
- FuncInfo->isChainFunction() ? Register() : StackPtrReg,
+ emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
FramePtrRegScratchCopy, NeedsFrameMoves);
- } else {
+ } else if (SavesStackRegs) {
// CSR spill stores will use FP as base register.
Register SGPRForFPSaveRestoreCopy =
FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
diff --git b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -952,7 +952,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) #0 {
; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GISEL-GFX11-NEXT: s_endpgm
;
-; GISEL-GFX10-LABEL: amdgpu_cs_chain_realign_stack:
+; GISEL-GFX10-LABEL: amdgpu_cs_chain_dont_realign_stack:
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
@@ -976,7 +976,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) #0 {
; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GISEL-GFX10-NEXT: s_endpgm
;
-; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_realign_stack:
+; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_dont_realign_stack:
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: s_add_i32 s33, s32, 31
@@ -990,7 +990,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) #0 {
; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; DAGISEL-GFX11-NEXT: s_endpgm
;
-; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_realign_stack:
+; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_dont_realign_stack:
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
@@ -1018,119 +1018,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) #0 {
ret void
}
-define amdgpu_cs_chain void @amdgpu_cs_chain_realign_stack_chain_call(i32 %idx, <3 x i32> inreg %a, <3 x i32> %b) {
-; GISEL-GFX11-LABEL: amdgpu_cs_chain_realign_stack_chain_call:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_mov_b32 s7, 4
-; GISEL-GFX11-NEXT: s_mov_b32 s6, 3
-; GISEL-GFX11-NEXT: s_mov_b32 s5, 2
-; GISEL-GFX11-NEXT: s_mov_b32 s4, 1
-; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v8
-; GISEL-GFX11-NEXT: s_add_i32 s33, s32, 31
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GISEL-GFX11-NEXT: s_and_not1_b32 s33, s33, 31
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, s33, v4
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v10, v11
-; GISEL-GFX11-NEXT: s_mov_b32 s34, s32
-; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
-; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
-; GISEL-GFX11-NEXT: s_addk_i32 s32, 0xc0
-; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
-; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GISEL-GFX11-NEXT: s_mov_b32 s32, s34
-; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5]
-;
-; GISEL-GFX10-LABEL: amdgpu_cs_chain_realign_stack_chain_call:
-; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
-; GISEL-GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v8
-; GISEL-GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GISEL-GFX10-NEXT: v_lshrrev_b32_e64 v2, 5, s33
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v9
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v9, v10
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v10, v11
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3
-; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2
-; GISEL-GFX10-NEXT: s_mov_b32 s34, s32
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4
-; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
-; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
-; GISEL-GFX10-NEXT: s_addk_i32 s32, 0x1800
-; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen
-; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4
-; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GISEL-GFX10-NEXT: buffer_store_dword v3, v0, s[48:51], 0 offen offset:8
-; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GISEL-GFX10-NEXT: buffer_store_dword v4, v0, s[48:51], 0 offen offset:12
-; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GISEL-GFX10-NEXT: s_mov_b32 s32, s34
-; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
-;
-; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_realign_stack_chain_call:
-; DAGISEL-GFX11: ; %bb.0:
-; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT: s_add_i32 s33, s32, 31
-; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
-; DAGISEL-GFX11-NEXT: s_and_not1_b32 s33, s33, 31
-; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, s33
-; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v10, v11
-; DAGISEL-GFX11-NEXT: s_mov_b32 s34, s32
-; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
-; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
-; DAGISEL-GFX11-NEXT: s_addk_i32 s32, 0xc0
-; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
-; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; DAGISEL-GFX11-NEXT: s_mov_b32 s32, s34
-; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5]
-;
-; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_realign_stack_chain_call:
-; DAGISEL-GFX10: ; %bb.0:
-; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3
-; DAGISEL-GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2
-; DAGISEL-GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33
-; DAGISEL-GFX10-NEXT: s_mov_b32 s34, s32
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1
-; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
-; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
-; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v0, v8, 4, v1
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v9
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v9, v10
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v10, v11
-; DAGISEL-GFX10-NEXT: s_addk_i32 s32, 0x1800
-; DAGISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen offset:12
-; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; DAGISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:8
-; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; DAGISEL-GFX10-NEXT: buffer_store_dword v3, v0, s[48:51], 0 offen offset:4
-; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; DAGISEL-GFX10-NEXT: buffer_store_dword v4, v0, s[48:51], 0 offen
-; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; DAGISEL-GFX10-NEXT: s_mov_b32 s32, s34
-; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
- %alloca.align32 = alloca [8 x <4 x i32>], align 32, addrspace(5)
- %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align32, i32 0, i32 %idx
- store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 32
- call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
- unreachable
-}
-
declare void @llvm.amdgcn.cs.chain.v2i32(ptr, i32, <2 x i32>, <2 x i32>, i32, ...)
declare void @llvm.amdgcn.cs.chain.v3i32(ptr, i32, <3 x i32>, <3 x i32>, i32, ...)
declare void @llvm.amdgcn.cs.chain.v4i32(ptr, i32, <4 x i32>, <4 x i32>, i32, ...)
diff --git b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=GFX942 %s
; These situations are "special" in that they either have an alloca that is not
@@ -22,7 +22,7 @@ define amdgpu_cs_chain void @test_alloca() #0 {
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_mov_b32 s0, s32
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_add_co_i32 s32, s0, 0x400
+; GFX12-NEXT: s_add_co_i32 s32, s0, 0x200
; GFX12-NEXT: scratch_store_b32 off, v0, s0
; GFX12-NEXT: s_endpgm
;
@@ -61,7 +61,7 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) #0 {
; GFX12-NEXT: s_and_b32 s0, s0, -16
; GFX12-NEXT: s_mov_b32 s1, s32
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_lshl_b32 s0, s0, 6
+; GFX12-NEXT: s_lshl_b32 s0, s0, 5
; GFX12-NEXT: scratch_store_b32 off, v0, s1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s32, s1, s0
@@ -93,41 +93,32 @@ define amdgpu_cs_chain void @test_alloca_var(i32 %count) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshl_add_u32 v3, v8, 2, 15
+; GFX12-NEXT: v_lshl_add_u32 v2, v8, 2, 15
; GFX12-NEXT: s_mov_b32 s33, s32
; GFX12-NEXT: s_add_co_i32 s32, s32, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, -16, v3
-; GFX12-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX12-NEXT: v_and_b32_e32 v2, -16, v2
+; GFX12-NEXT: s_or_saveexec_b32 s0, -1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v3, s[0:1]
-; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX12-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX12-NEXT: v_add_nc_u32_e32 v2, 32, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX12-NEXT: v_mul_lo_u32 v2, 4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX12-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
-; GFX12-NEXT: ds_permute_b32 v1, v2, v0
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readlane_b32 s2, v0, 63
-; GFX12-NEXT: s_mov_b64 exec, s[0:1]
+; GFX12-NEXT: v_readlane_b32 s1, v0, 31
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_mov_b32 s0, s32
-; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: v_lshl_add_u32 v3, s2, 6, s0
-; GFX12-NEXT: scratch_store_b32 off, v4, s0
-; GFX12-NEXT: v_readfirstlane_b32 s32, v3
+; GFX12-NEXT: v_lshl_add_u32 v2, s1, 5, s0
+; GFX12-NEXT: scratch_store_b32 off, v3, s0
+; GFX12-NEXT: v_readfirstlane_b32 s32, v2
; GFX12-NEXT: s_endpgm
;
; GFX942-LABEL: test_alloca_var:
@@ -187,7 +178,7 @@ define amdgpu_cs_chain void @test_alloca_and_call() #0 {
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-NEXT: s_mov_b32 s2, s32
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_add_co_i32 s32, s2, 0x400
+; GFX12-NEXT: s_add_co_i32 s32, s2, 0x200
; GFX12-NEXT: scratch_store_b32 off, v0, s2
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -242,7 +233,7 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
; GFX12-NEXT: s_and_b32 s0, s0, -16
; GFX12-NEXT: s_mov_b32 s1, s32
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_lshl_b32 s0, s0, 6
+; GFX12-NEXT: s_lshl_b32 s0, s0, 5
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s32, s1, s0
; GFX12-NEXT: scratch_store_b32 off, v0, s1
@@ -284,50 +275,42 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshl_add_u32 v3, v8, 2, 15
+; GFX12-NEXT: v_lshl_add_u32 v2, v8, 2, 15
; GFX12-NEXT: s_mov_b32 s33, s32
; GFX12-NEXT: s_add_co_i32 s32, s32, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, -16, v3
-; GFX12-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX12-NEXT: v_and_b32_e32 v2, -16, v2
+; GFX12-NEXT: s_or_saveexec_b32 s2, -1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v3, s[0:1]
-; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX12-NEXT: s_getpc_b64 s[2:3]
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v2, s2
+; GFX12-NEXT: s_getpc_b64 s[0:1]
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_sext_i32_i16 s3, s3
-; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX12-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX12-NEXT: v_add_nc_u32_e32 v2, 32, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX12-NEXT: v_mul_lo_u32 v2, 4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX12-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
-; GFX12-NEXT: ds_permute_b32 v1, v2, v0
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
-; GFX12-NEXT: s_mov_b64 exec, s[0:1]
-; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX12-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: s_or_saveexec_b32 s2, -1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readlane_b32 s4, v0, 63
+; GFX12-NEXT: v_readlane_b32 s3, v0, 31
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_mov_b64 exec, s[2:3]
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_mov_b32 s2, s32
-; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: v_lshl_add_u32 v3, s4, 6, s2
-; GFX12-NEXT: scratch_store_b32 off, v4, s2
-; GFX12-NEXT: v_readfirstlane_b32 s32, v3
+; GFX12-NEXT: v_lshl_add_u32 v2, s3, 5, s2
+; GFX12-NEXT: scratch_store_b32 off, v3, s2
+; GFX12-NEXT: v_readfirstlane_b32 s32, v2
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -395,7 +378,7 @@ define amdgpu_cs_chain void @test_call_and_alloca() #0 {
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
; GFX12-NEXT: s_mov_b32 s4, s32
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX12-NEXT: s_add_co_i32 s32, s4, 0x400
+; GFX12-NEXT: s_add_co_i32 s32, s4, 0x200
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -450,7 +433,7 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
; GFX12-NEXT: s_and_b32 s0, s0, -16
; GFX12-NEXT: s_mov_b32 s4, s32
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_lshl_b32 s0, s0, 6
+; GFX12-NEXT: s_lshl_b32 s0, s0, 5
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s32, s4, s0
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -493,54 +476,46 @@ define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshl_add_u32 v3, v8, 2, 15
+; GFX12-NEXT: v_lshl_add_u32 v2, v8, 2, 15
; GFX12-NEXT: s_mov_b32 s33, s32
; GFX12-NEXT: s_add_co_i32 s32, s32, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, -16, v3
-; GFX12-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX12-NEXT: v_and_b32_e32 v2, -16, v2
+; GFX12-NEXT: s_or_saveexec_b32 s2, -1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v3, s[0:1]
-; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX12-NEXT: s_getpc_b64 s[2:3]
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v2, s2
+; GFX12-NEXT: s_getpc_b64 s[0:1]
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_sext_i32_i16 s3, s3
-; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX12-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX12-NEXT: v_add_nc_u32_e32 v2, 32, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX12-NEXT: v_mul_lo_u32 v2, 4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX12-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
-; GFX12-NEXT: ds_permute_b32 v1, v2, v0
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
-; GFX12-NEXT: s_mov_b64 exec, s[0:1]
-; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX12-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: s_or_saveexec_b32 s2, -1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readlane_b32 s4, v0, 63
+; GFX12-NEXT: v_readlane_b32 s3, v0, 31
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_mov_b64 exec, s[2:3]
-; GFX12-NEXT: s_mov_b32 s5, s32
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_mov_b32 s4, s32
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: v_lshl_add_u32 v3, s4, 6, s5
+; GFX12-NEXT: v_lshl_add_u32 v2, s3, 5, s4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readfirstlane_b32 s32, v3
+; GFX12-NEXT: v_readfirstlane_b32 s32, v2
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v3, 0
-; GFX12-NEXT: scratch_store_b32 off, v3, s5
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: scratch_store_b32 off, v2, s4
; GFX12-NEXT: s_endpgm
;
; GFX942-LABEL: test_call_and_alloca_var:
diff --git b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
@@ -1,8 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GFX900 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GFX942 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GFX12,GFX1200 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GFX12,GFX1250 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK %s
---
name: spill_v32
@@ -208,187 +205,3 @@ body: |
SI_SPILL_V64_SAVE undef $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
S_NOP 0, implicit undef $vgpr0_vgpr1
...
-
----
-name: spill_v128_kill_unaligned
-tracksRegLiveness: true
-stack:
- - { id: 0, type: spill-slot, size: 16, alignment: 4 }
-machineFunctionInfo:
- scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
- stackPtrOffsetReg: '$sgpr32'
- frameOffsetReg: '$sgpr33'
-body: |
- bb.0:
- liveins: $vgpr1_vgpr2_vgpr3_vgpr4
-
- ; GFX900-LABEL: name: spill_v128_kill_unaligned
- ; GFX900: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
- ; GFX900-NEXT: {{ $}}
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- ;
- ; GFX942-LABEL: name: spill_v128_kill_unaligned
- ; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
- ; GFX942-NEXT: {{ $}}
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- ;
- ; GFX1200-LABEL: name: spill_v128_kill_unaligned
- ; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
- ; GFX1200-NEXT: {{ $}}
- ; GFX1200-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr1_vgpr2_vgpr3_vgpr4, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
- ;
- ; GFX1250-LABEL: name: spill_v128_kill_unaligned
- ; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
- ; GFX1250-NEXT: {{ $}}
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- SI_SPILL_V128_SAVE killed $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5)
-...
-
----
-name: spill_v128_unaligned
-tracksRegLiveness: true
-stack:
- - { id: 0, type: spill-slot, size: 16, alignment: 4 }
-machineFunctionInfo:
- scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
- stackPtrOffsetReg: '$sgpr32'
- frameOffsetReg: '$sgpr33'
-body: |
- bb.0:
- liveins: $vgpr1_vgpr2_vgpr3_vgpr4
-
- ; GFX900-LABEL: name: spill_v128_unaligned
- ; GFX900: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
- ; GFX900-NEXT: {{ $}}
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- ;
- ; GFX942-LABEL: name: spill_v128_unaligned
- ; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
- ; GFX942-NEXT: {{ $}}
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- ;
- ; GFX1200-LABEL: name: spill_v128_unaligned
- ; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
- ; GFX1200-NEXT: {{ $}}
- ; GFX1200-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr1_vgpr2_vgpr3_vgpr4, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
- ;
- ; GFX1250-LABEL: name: spill_v128_unaligned
- ; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
- ; GFX1250-NEXT: {{ $}}
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5)
-...
-
----
-name: spill_v256_aligned
-tracksRegLiveness: true
-stack:
- - { id: 0, type: spill-slot, size: 16, alignment: 4 }
-machineFunctionInfo:
- scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
- stackPtrOffsetReg: '$sgpr32'
- frameOffsetReg: '$sgpr33'
-body: |
- bb.0:
- liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-
- ; GFX900-LABEL: name: spill_v256_aligned
- ; GFX900: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
- ; GFX900-NEXT: {{ $}}
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 16, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 20, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 24, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 28, addrspace 5)
- ;
- ; GFX942-LABEL: name: spill_v256_aligned
- ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
- ; GFX942-NEXT: {{ $}}
- ; GFX942-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s128) into %stack.0 + 16, align 4, addrspace 5)
- ;
- ; GFX12-LABEL: name: spill_v256_aligned
- ; GFX12: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
- ; GFX12-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s128) into %stack.0 + 16, align 4, addrspace 5)
- SI_SPILL_V256_SAVE $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5)
-...
-
----
-name: spill_v256_unaligned
-tracksRegLiveness: true
-stack:
- - { id: 0, type: spill-slot, size: 16, alignment: 4 }
-machineFunctionInfo:
- scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
- stackPtrOffsetReg: '$sgpr32'
- frameOffsetReg: '$sgpr33'
-body: |
- bb.0:
- liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
-
- ; GFX900-LABEL: name: spill_v256_unaligned
- ; GFX900: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
- ; GFX900-NEXT: {{ $}}
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 16, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 20, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 24, addrspace 5)
- ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 28, addrspace 5)
- ;
- ; GFX942-LABEL: name: spill_v256_unaligned
- ; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
- ; GFX942-NEXT: {{ $}}
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 16, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr6, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 20, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr7, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 24, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 28, addrspace 5)
- ;
- ; GFX1200-LABEL: name: spill_v256_unaligned
- ; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
- ; GFX1200-NEXT: {{ $}}
- ; GFX1200-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr1_vgpr2_vgpr3_vgpr4, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
- ; GFX1200-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr5_vgpr6_vgpr7_vgpr8, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s128) into %stack.0 + 16, align 4, addrspace 5)
- ;
- ; GFX1250-LABEL: name: spill_v256_unaligned
- ; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
- ; GFX1250-NEXT: {{ $}}
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 16, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr6, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 20, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr7, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 24, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 28, addrspace 5)
- SI_SPILL_V256_SAVE $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5)
-...
```
https://github.com/llvm/llvm-project/pull/183153
More information about the llvm-branch-commits mailing list