[llvm] 16945bc - [AMDGPU] Don't send DEALLOC_VGPRs after calls (#77439)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 11 00:14:57 PST 2024
Author: Diana Picus
Date: 2024-01-11T09:14:52+01:00
New Revision: 16945bc16dbb4c4acac854001b73e1454f3b601c
URL: https://github.com/llvm/llvm-project/commit/16945bc16dbb4c4acac854001b73e1454f3b601c
DIFF: https://github.com/llvm/llvm-project/commit/16945bc16dbb4c4acac854001b73e1454f3b601c.diff
LOG: [AMDGPU] Don't send DEALLOC_VGPRs after calls (#77439)
Calls do not have to wait for VsCnt, so after they return there might
still be scratch stores in progress. It's important that we don't send
the DEALLOC_VGPRS message in that case, since that might release the
VGPRs and scratch allocation before those stores are complete.
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/test/CodeGen/AMDGPU/call-argument-types.ll
llvm/test/CodeGen/AMDGPU/calling-conventions.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
llvm/test/CodeGen/AMDGPU/release-vgprs.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1cb1d32707f2d7..1f480c248154e3 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -292,7 +292,7 @@ class WaitcntBrackets {
VgprVmemTypes[GprNo] = 0;
}
- void setNonKernelFunctionInitialState() {
+ void setStateOnFunctionEntryOrReturn() {
setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
PendingEvents |= WaitEventMaskForInst[VS_CNT];
}
@@ -1487,6 +1487,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (callWaitsOnFunctionReturn(Inst)) {
// Act as a wait on everything
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
+ ScoreBrackets->setStateOnFunctionEntryOrReturn();
} else {
// May need to way wait for anything.
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
@@ -1879,7 +1880,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
auto NonKernelInitialState =
std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
- NonKernelInitialState->setNonKernelFunctionInitialState();
+ NonKernelInitialState->setStateOnFunctionEntryOrReturn();
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
Modified = true;
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index a192a1b8dff935..87e17a1c82080b 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4462,8 +4462,6 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_i32_func_i32_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index d63ebdeb50a1f9..ce1ce649c227d2 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -167,8 +167,6 @@ define amdgpu_kernel void @call_coldcc() #0 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call float @coldcc(float 1.0)
store float %val, ptr addrspace(1) undef
@@ -231,8 +229,6 @@ define amdgpu_kernel void @call_fastcc() #0 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call float @fastcc(float 1.0)
store float %val, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 5ebd3eef69f257..499046a2e222d3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -626,8 +626,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1164-NEXT: .LBB1_4:
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -675,8 +673,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1132-NEXT: .LBB1_4:
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -988,8 +984,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB1_2:
-; GFX1164-DPP-NEXT: s_nop 0
-; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -1051,8 +1045,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB1_2:
-; GFX1132-DPP-NEXT: s_nop 0
-; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
@@ -3042,8 +3034,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1164-NEXT: .LBB5_4:
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -3091,8 +3081,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1132-NEXT: .LBB5_4:
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -3404,8 +3392,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB5_2:
-; GFX1164-DPP-NEXT: s_nop 0
-; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -3467,8 +3453,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB5_2:
-; GFX1132-DPP-NEXT: s_nop 0
-; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
@@ -3770,8 +3754,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1164-NEXT: .LBB6_4:
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -3819,8 +3801,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1132-NEXT: .LBB6_4:
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -4132,8 +4112,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB6_2:
-; GFX1164-DPP-NEXT: s_nop 0
-; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -4195,8 +4173,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB6_2:
-; GFX1132-DPP-NEXT: s_nop 0
-; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 8081d40f7e665c..b6afb7cf8c9a11 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -300,8 +300,6 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -930,8 +928,6 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1294,8 +1290,6 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add3_u32 v0, v3, v1, v0
; GFX11-NEXT: global_store_b32 v6, v0, s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1543,8 +1537,6 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1753,8 +1745,6 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, v2, v0, v3
; GFX11-NEXT: global_store_b32 v6, v0, s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -2017,8 +2007,6 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v12, v[0:1], s[36:37]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
ptr addrspace(1) %buffer2) {
entry:
@@ -2349,8 +2337,6 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -2553,8 +2539,6 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0) #2
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index 07f8567ac06821..e80585299b91a5 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -22,6 +22,8 @@
define amdgpu_ps void @global_atomic() { ret void }
define amdgpu_ps void @image_atomic() { ret void }
define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
+ define amdgpu_cs void @with_calls() { ret void }
+ define fastcc void @with_tail_calls() { ret void }
...
---
@@ -565,3 +567,33 @@ body: |
S_WAITCNT_VSCNT undef $sgpr_null, 0
S_ENDPGM 0
...
+
+---
+name: with_calls
+frameInfo:
+ hasCalls: true
+body: |
+ bb.0:
+ ; Make sure we don't send DEALLOC_VGPRS after a call, since there might be
+ ; scratch stores still in progress.
+ ; CHECK-LABEL: name: with_calls
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ENDPGM 0
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu
+ S_ENDPGM 0
+...
+
+---
+name: with_tail_calls
+frameInfo:
+ hasCalls: true
+body: |
+ bb.0:
+ ; Make sure we don't send DEALLOC_VGPRS when there's a tail call, since the
+ ; only valid action after DEALLOC_VGPRS is to terminate the wave.
+ ; CHECK-LABEL: name: with_tail_calls
+ ; CHECK-NOT: S_SENDMSG 3
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ SI_TCRETURN undef renamable $sgpr4_sgpr5, @with_tail_calls, 0, csr_amdgpu
+...
More information about the llvm-commits
mailing list