[llvm] [AMDGPU] Don't DEALLOC_VGPRS from callable functions (PR #72245)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 15 06:16:32 PST 2023
https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/72245
>From 48b3801ed3a1d44e36aed980a053e67f274fd853 Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Wed, 8 Nov 2023 08:57:45 +0100
Subject: [PATCH 1/2] [AMDGPU] Don't DEALLOC_VGPRS from callable functions
Callable functions should not send the DEALLOC_VGPRS message, because
that might release the VGPRs and scratch allocation before the caller is
done with them.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 9 +++++--
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 +++
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 5 ++++
llvm/test/CodeGen/AMDGPU/release-vgprs.mir | 26 +++++++++++++++++++
4 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..d862b37443aec83 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1027,6 +1027,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.VmCnt = 0;
}
+ CallingConv::ID CC = MI.getMF()->getFunction().getCallingConv();
+
// All waits must be resolved at call return.
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
@@ -1039,10 +1041,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
// stores. In this case it can be useful to send a message to explicitly
// release all VGPRs before the stores have completed, but it is only safe to
- // do this if there are no outstanding scratch stores.
+ // do this if:
+ // * there are no outstanding scratch stores
+ // * this is not a callable function
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
- if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
+ if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+ !AMDGPU::isCallableCC(CC) && !OptNone &&
ScoreBrackets.getScoreRange(VS_CNT) != 0 &&
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
ReleaseVGPRInsts.insert(&MI);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a09abc639d7590f..5ea135c7b90dd62 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1924,6 +1924,10 @@ bool isChainCC(CallingConv::ID CC) {
}
}
+bool isCallableCC(CallingConv::ID CC) {
+ return !isEntryFunctionCC(CC) && !isChainCC(CC);
+}
+
bool isKernelCC(const Function *Func) {
return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1e0994d0862cf5d..965414019263448 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1114,6 +1114,11 @@ bool isModuleEntryFunctionCC(CallingConv::ID CC);
LLVM_READNONE
bool isChainCC(CallingConv::ID CC);
+// Functions that are called via the 'call' instruction, rather than launched
+// by the hardware or via the 'llvm.amdgcn.cs.chain' intrinsic.
+LLVM_READNONE
+bool isCallableCC(CallingConv::ID CC);
+
bool isKernelCC(const Function *Func);
// FIXME: Remove this when calling conventions cleaned up
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index 3a879e818af797b..39ced04253b571a 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -22,6 +22,8 @@
define amdgpu_ps void @global_atomic() { ret void }
define amdgpu_ps void @image_atomic() { ret void }
define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
+ define amdgpu_gfx void @gfx_function() { ret void }
+ define void @ccc_function() { ret void }
...
---
@@ -556,3 +558,27 @@ body: |
S_WAITCNT_VSCNT undef $sgpr_null, 0
S_ENDPGM 0
...
+
+---
+name: gfx_function
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: gfx_function
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ENDPGM 0
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ S_WAITCNT_VSCNT undef $sgpr_null, 0
+ S_ENDPGM 0
+...
+
+---
+name: ccc_function
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: ccc_function
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ENDPGM 0
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ S_WAITCNT_VSCNT undef $sgpr_null, 0
+ S_ENDPGM 0
+...
>From 39f5b846aebc7a5ef44fab4797c469a7b60f11ef Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Wed, 8 Nov 2023 08:57:45 +0100
Subject: [PATCH 2/2] Address comments + minor cleanup
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 7 ++---
.../llvm.amdgcn.set.inactive.chain.arg.ll | 28 -------------------
llvm/test/CodeGen/AMDGPU/release-vgprs.mir | 2 --
3 files changed, 3 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index d862b37443aec83..c9b095456971fb8 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1041,13 +1041,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
// stores. In this case it can be useful to send a message to explicitly
// release all VGPRs before the stores have completed, but it is only safe to
- // do this if:
- // * there are no outstanding scratch stores
- // * this is not a callable function
+ // do this if there are no outstanding scratch stores (either from the current
+ // function or potentially from a caller or callee).
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
- !AMDGPU::isCallableCC(CC) && !OptNone &&
+ AMDGPU::isEntryFunctionCC(CC) && !OptNone &&
ScoreBrackets.getScoreRange(VS_CNT) != 0 &&
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
ReleaseVGPRInsts.insert(&MI);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index 6fd6d6e2e31a1c2..65b70587fa0ace5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -17,8 +17,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
; GFX11-NEXT: v_mov_b32_e32 v0, v10
; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11-NEXT: global_store_b32 v[8:9], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX10-LABEL: set_inactive_chain_arg:
@@ -39,8 +37,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10
; GFX11_W64-NEXT: s_not_b64 exec, exec
; GFX11_W64-NEXT: global_store_b32 v[8:9], v0, off
-; GFX11_W64-NEXT: s_nop 0
-; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11_W64-NEXT: s_endpgm
;
; GFX10_W64-LABEL: set_inactive_chain_arg:
@@ -68,8 +64,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; GFX11-NEXT: v_mov_b32_e32 v1, v11
; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11-NEXT: global_store_b64 v[8:9], v[0:1], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX10-LABEL: set_inactive_chain_arg_64:
@@ -94,8 +88,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; GFX11_W64-NEXT: v_mov_b32_e32 v1, v11
; GFX11_W64-NEXT: s_not_b64 exec, exec
; GFX11_W64-NEXT: global_store_b64 v[8:9], v[0:1], off
-; GFX11_W64-NEXT: s_nop 0
-; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11_W64-NEXT: s_endpgm
;
; GFX10_W64-LABEL: set_inactive_chain_arg_64:
@@ -133,8 +125,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v2, v1
; GFX11-NEXT: global_store_b32 v[8:9], v2, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX10-LABEL: set_inactive_chain_arg_dpp:
@@ -174,8 +164,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11_W64-NEXT: v_mov_b32_e32 v2, v1
; GFX11_W64-NEXT: global_store_b32 v[8:9], v2, off
-; GFX11_W64-NEXT: s_nop 0
-; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11_W64-NEXT: s_endpgm
;
; GFX10_W64-LABEL: set_inactive_chain_arg_dpp:
@@ -233,8 +221,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL11-NEXT: v_mov_b32_e32 v0, v12
; GISEL11-NEXT: global_store_b32 v[41:42], v0, off
-; GISEL11-NEXT: s_nop 0
-; GISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL11-NEXT: s_endpgm
;
; DAGISEL11-LABEL: set_inactive_chain_arg_call:
@@ -265,8 +251,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off
-; DAGISEL11-NEXT: s_nop 0
-; DAGISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; DAGISEL11-NEXT: s_endpgm
;
; GISEL10-LABEL: set_inactive_chain_arg_call:
@@ -380,8 +364,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
-; GISEL11_W64-NEXT: s_nop 0
-; GISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL11_W64-NEXT: s_endpgm
;
; DAGISEL11_W64-LABEL: set_inactive_chain_arg_call:
@@ -419,8 +401,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
-; DAGISEL11_W64-NEXT: s_nop 0
-; DAGISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; DAGISEL11_W64-NEXT: s_endpgm
;
; GISEL10_W64-LABEL: set_inactive_chain_arg_call:
@@ -538,8 +518,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL11-NEXT: v_mov_b32_e32 v0, v12
; GISEL11-NEXT: global_store_b32 v[41:42], v0, off
-; GISEL11-NEXT: s_nop 0
-; GISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL11-NEXT: s_endpgm
;
; DAGISEL11-LABEL: set_inactive_chain_arg_last_vgpr:
@@ -570,8 +548,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off
-; DAGISEL11-NEXT: s_nop 0
-; DAGISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; DAGISEL11-NEXT: s_endpgm
;
; GISEL10-LABEL: set_inactive_chain_arg_last_vgpr:
@@ -685,8 +661,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
-; GISEL11_W64-NEXT: s_nop 0
-; GISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL11_W64-NEXT: s_endpgm
;
; DAGISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr:
@@ -724,8 +698,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
-; DAGISEL11_W64-NEXT: s_nop 0
-; DAGISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; DAGISEL11_W64-NEXT: s_endpgm
;
; GISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index 39ced04253b571a..6366485874c1164 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -567,7 +567,6 @@ body: |
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
- S_WAITCNT_VSCNT undef $sgpr_null, 0
S_ENDPGM 0
...
@@ -579,6 +578,5 @@ body: |
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
- S_WAITCNT_VSCNT undef $sgpr_null, 0
S_ENDPGM 0
...
More information about the llvm-commits
mailing list