[llvm] [AMDGPU] Don't send DEALLOC_VGPRs after calls (PR #77439)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 9 02:33:43 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Diana Picus (rovka)
<details>
<summary>Changes</summary>
Calls do not have to wait for VsCnt, so after they return there might still be scratch stores in progress. It's important that we don't send the DEALLOC_VGPRS message in that case, since that might release the VGPRs and scratch allocation before those stores are complete.
---
Full diff: https://github.com/llvm/llvm-project/pull/77439.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+4)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (-2)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (-4)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (-24)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (-16)
- (modified) llvm/test/CodeGen/AMDGPU/release-vgprs.mir (+17)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1cb1d32707f2d7..24b13673690ddb 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -252,6 +252,7 @@ class WaitcntBrackets {
const MachineRegisterInfo *MRI, WaitEventType E,
MachineInstr &MI);
+ void setPendingEvent(WaitEventType E) { PendingEvents |= 1 << E; }
unsigned hasPendingEvent() const { return PendingEvents; }
unsigned hasPendingEvent(WaitEventType E) const {
return PendingEvents & (1 << E);
@@ -1487,6 +1488,9 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (callWaitsOnFunctionReturn(Inst)) {
// Act as a wait on everything
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
+ // Since there's no guaranteed wait on VsCnt, scratch stores might still
+ // be in flight.
+ ScoreBrackets->setPendingEvent(SCRATCH_WRITE_ACCESS);
} else {
// May need to way wait for anything.
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index a192a1b8dff935..87e17a1c82080b 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4462,8 +4462,6 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_i32_func_i32_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index d63ebdeb50a1f9..ce1ce649c227d2 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -167,8 +167,6 @@ define amdgpu_kernel void @call_coldcc() #0 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call float @coldcc(float 1.0)
store float %val, ptr addrspace(1) undef
@@ -231,8 +229,6 @@ define amdgpu_kernel void @call_fastcc() #0 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call float @fastcc(float 1.0)
store float %val, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 5ebd3eef69f257..499046a2e222d3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -626,8 +626,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1164-NEXT: .LBB1_4:
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -675,8 +673,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1132-NEXT: .LBB1_4:
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -988,8 +984,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB1_2:
-; GFX1164-DPP-NEXT: s_nop 0
-; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -1051,8 +1045,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB1_2:
-; GFX1132-DPP-NEXT: s_nop 0
-; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
@@ -3042,8 +3034,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1164-NEXT: .LBB5_4:
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -3091,8 +3081,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1132-NEXT: .LBB5_4:
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -3404,8 +3392,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB5_2:
-; GFX1164-DPP-NEXT: s_nop 0
-; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -3467,8 +3453,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB5_2:
-; GFX1132-DPP-NEXT: s_nop 0
-; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
@@ -3770,8 +3754,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1164-NEXT: .LBB6_4:
-; GFX1164-NEXT: s_nop 0
-; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -3819,8 +3801,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1132-NEXT: .LBB6_4:
-; GFX1132-NEXT: s_nop 0
-; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -4132,8 +4112,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB6_2:
-; GFX1164-DPP-NEXT: s_nop 0
-; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -4195,8 +4173,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB6_2:
-; GFX1132-DPP-NEXT: s_nop 0
-; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 8081d40f7e665c..b6afb7cf8c9a11 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -300,8 +300,6 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -930,8 +928,6 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1294,8 +1290,6 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add3_u32 v0, v3, v1, v0
; GFX11-NEXT: global_store_b32 v6, v0, s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1543,8 +1537,6 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1753,8 +1745,6 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, v2, v0, v3
; GFX11-NEXT: global_store_b32 v6, v0, s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -2017,8 +2007,6 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v12, v[0:1], s[36:37]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
ptr addrspace(1) %buffer2) {
entry:
@@ -2349,8 +2337,6 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -2553,8 +2539,6 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0) #2
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index 07f8567ac06821..f4c8645f6edb6e 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -22,6 +22,7 @@
define amdgpu_ps void @global_atomic() { ret void }
define amdgpu_ps void @image_atomic() { ret void }
define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
+ define amdgpu_cs void @with_calls() { ret void }
...
---
@@ -565,3 +566,19 @@ body: |
S_WAITCNT_VSCNT undef $sgpr_null, 0
S_ENDPGM 0
...
+
+---
+name: with_calls
+frameInfo:
+ hasCalls: true
+body: |
+ bb.0:
+ ; Make sure we don't send DEALLOC_VGPRS after a call, since there might be
+ ; scratch stores still in progress.
+ ; CHECK-LABEL: name: with_calls
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ENDPGM 0
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu
+ S_ENDPGM 0
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/77439
More information about the llvm-commits mailing list