[llvm] [AMDGPU] Don't DEALLOC_VGPRS from callable functions (PR #72245)

via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 15 06:16:32 PST 2023


https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/72245

>From 48b3801ed3a1d44e36aed980a053e67f274fd853 Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Wed, 8 Nov 2023 08:57:45 +0100
Subject: [PATCH 1/2] [AMDGPU] Don't DEALLOC_VGPRS from callable functions

Callable functions should not send the DEALLOC_VGPRS message, because
that might release the VGPRs and scratch allocation before the caller is
done with them.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  9 +++++--
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  4 +++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  5 ++++
 llvm/test/CodeGen/AMDGPU/release-vgprs.mir    | 26 +++++++++++++++++++
 4 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..d862b37443aec83 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1027,6 +1027,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
     Wait.VmCnt = 0;
   }
 
+  CallingConv::ID CC = MI.getMF()->getFunction().getCallingConv();
+
   // All waits must be resolved at call return.
   // NOTE: this could be improved with knowledge of all call sites or
   //   with knowledge of the called routines.
@@ -1039,10 +1041,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
   // stores. In this case it can be useful to send a message to explicitly
   // release all VGPRs before the stores have completed, but it is only safe to
-  // do this if there are no outstanding scratch stores.
+  // do this if:
+  // * there are no outstanding scratch stores
+  // * this is not a callable function
   else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
-    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
+    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+        !AMDGPU::isCallableCC(CC) && !OptNone &&
         ScoreBrackets.getScoreRange(VS_CNT) != 0 &&
         !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
       ReleaseVGPRInsts.insert(&MI);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a09abc639d7590f..5ea135c7b90dd62 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1924,6 +1924,10 @@ bool isChainCC(CallingConv::ID CC) {
   }
 }
 
+bool isCallableCC(CallingConv::ID CC) {
+  return !isEntryFunctionCC(CC) && !isChainCC(CC);
+}
+
 bool isKernelCC(const Function *Func) {
   return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1e0994d0862cf5d..965414019263448 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1114,6 +1114,11 @@ bool isModuleEntryFunctionCC(CallingConv::ID CC);
 LLVM_READNONE
 bool isChainCC(CallingConv::ID CC);
 
+// Functions that are called via the 'call' instruction, rather than launched
+// by the hardware or via the 'llvm.amdgcn.cs.chain' intrinsic.
+LLVM_READNONE
+bool isCallableCC(CallingConv::ID CC);
+
 bool isKernelCC(const Function *Func);
 
 // FIXME: Remove this when calling conventions cleaned up
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index 3a879e818af797b..39ced04253b571a 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -22,6 +22,8 @@
   define amdgpu_ps void @global_atomic() { ret void }
   define amdgpu_ps void @image_atomic() { ret void }
   define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
+  define amdgpu_gfx void @gfx_function() { ret void }
+  define void @ccc_function() { ret void }
 ...
 
 ---
@@ -556,3 +558,27 @@ body:             |
     S_WAITCNT_VSCNT undef $sgpr_null, 0
     S_ENDPGM 0
 ...
+
+---
+name:            gfx_function
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: gfx_function
+    ; CHECK-NOT: S_SENDMSG 3
+    ; CHECK: S_ENDPGM 0
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    S_WAITCNT_VSCNT undef $sgpr_null, 0
+    S_ENDPGM 0
+...
+
+---
+name:            ccc_function
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: ccc_function
+    ; CHECK-NOT: S_SENDMSG 3
+    ; CHECK: S_ENDPGM 0
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    S_WAITCNT_VSCNT undef $sgpr_null, 0
+    S_ENDPGM 0
+...

>From 39f5b846aebc7a5ef44fab4797c469a7b60f11ef Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Wed, 8 Nov 2023 08:57:45 +0100
Subject: [PATCH 2/2] Address comments + minor cleanup

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  7 ++---
 .../llvm.amdgcn.set.inactive.chain.arg.ll     | 28 -------------------
 llvm/test/CodeGen/AMDGPU/release-vgprs.mir    |  2 --
 3 files changed, 3 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index d862b37443aec83..c9b095456971fb8 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1041,13 +1041,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
   // stores. In this case it can be useful to send a message to explicitly
   // release all VGPRs before the stores have completed, but it is only safe to
-  // do this if:
-  // * there are no outstanding scratch stores
-  // * this is not a callable function
+  // do this if there are no outstanding scratch stores (either from the current
+  // function or potentially from a caller or callee).
   else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
     if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
-        !AMDGPU::isCallableCC(CC) && !OptNone &&
+        AMDGPU::isEntryFunctionCC(CC) && !OptNone &&
         ScoreBrackets.getScoreRange(VS_CNT) != 0 &&
         !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
       ReleaseVGPRInsts.insert(&MI);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index 6fd6d6e2e31a1c2..65b70587fa0ace5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -17,8 +17,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    global_store_b32 v[8:9], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: set_inactive_chain_arg:
@@ -39,8 +37,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    global_store_b32 v[8:9], v0, off
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: set_inactive_chain_arg:
@@ -68,8 +64,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    global_store_b64 v[8:9], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: set_inactive_chain_arg_64:
@@ -94,8 +88,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: set_inactive_chain_arg_64:
@@ -133,8 +125,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11-NEXT:    global_store_b32 v[8:9], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: set_inactive_chain_arg_dpp:
@@ -174,8 +164,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11_W64-NEXT:    global_store_b32 v[8:9], v2, off
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: set_inactive_chain_arg_dpp:
@@ -233,8 +221,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11-NEXT:    s_nop 0
-; GISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11-NEXT:    s_endpgm
 ;
 ; DAGISEL11-LABEL: set_inactive_chain_arg_call:
@@ -265,8 +251,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11-NEXT:    s_nop 0
-; DAGISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11-NEXT:    s_endpgm
 ;
 ; GISEL10-LABEL: set_inactive_chain_arg_call:
@@ -380,8 +364,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; GISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11_W64-NEXT:    s_nop 0
-; GISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11_W64-NEXT:    s_endpgm
 ;
 ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_call:
@@ -419,8 +401,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11_W64-NEXT:    s_nop 0
-; DAGISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11_W64-NEXT:    s_endpgm
 ;
 ; GISEL10_W64-LABEL: set_inactive_chain_arg_call:
@@ -538,8 +518,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11-NEXT:    s_nop 0
-; GISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11-NEXT:    s_endpgm
 ;
 ; DAGISEL11-LABEL: set_inactive_chain_arg_last_vgpr:
@@ -570,8 +548,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11-NEXT:    s_nop 0
-; DAGISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11-NEXT:    s_endpgm
 ;
 ; GISEL10-LABEL: set_inactive_chain_arg_last_vgpr:
@@ -685,8 +661,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; GISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11_W64-NEXT:    s_nop 0
-; GISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11_W64-NEXT:    s_endpgm
 ;
 ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr:
@@ -724,8 +698,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11_W64-NEXT:    s_nop 0
-; DAGISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11_W64-NEXT:    s_endpgm
 ;
 ; GISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index 39ced04253b571a..6366485874c1164 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -567,7 +567,6 @@ body:             |
     ; CHECK-NOT: S_SENDMSG 3
     ; CHECK: S_ENDPGM 0
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
-    S_WAITCNT_VSCNT undef $sgpr_null, 0
     S_ENDPGM 0
 ...
 
@@ -579,6 +578,5 @@ body:             |
     ; CHECK-NOT: S_SENDMSG 3
     ; CHECK: S_ENDPGM 0
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
-    S_WAITCNT_VSCNT undef $sgpr_null, 0
     S_ENDPGM 0
 ...



More information about the llvm-commits mailing list