[llvm] 1dd23c6 - AMDGPU: Allow tail calls for amdgpu_gfx functions
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 22 07:55:25 PDT 2021
Author: Matt Arsenault
Date: 2021-03-22T10:55:19-04:00
New Revision: 1dd23c6d53cc3caa78cb6461f226ad54c1805ecc
URL: https://github.com/llvm/llvm-project/commit/1dd23c6d53cc3caa78cb6461f226ad54c1805ecc
DIFF: https://github.com/llvm/llvm-project/commit/1dd23c6d53cc3caa78cb6461f226ad54c1805ecc.diff
LOG: AMDGPU: Allow tail calls for amdgpu_gfx functions
Added:
llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 124f7449bc27..b23248b21793 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2829,6 +2829,7 @@ static bool canGuaranteeTCO(CallingConv::ID CC) {
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
+ case CallingConv::AMDGPU_Gfx:
return true;
default:
return canGuaranteeTCO(CC);
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index c1f270143feb..15eefaebd4bf 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -3294,66 +3294,30 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-LABEL: tail_call_byval_align16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:8
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:12
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_add_u32 s32, s32, 0x800
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: tail_call_byval_align16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_add_u32 s32, s32, 0x400
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:12
-; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: s_sub_u32 s32, s32, 0x400
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
-; GFX10-NEXT: s_or_saveexec_b32 s6, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[4:5]
entry:
%alloca = alloca double, align 8, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
new file mode 100644
index 000000000000..75850a933b3e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -enable-var-scope %s
+
+; Callee with SGPR and VGPR arguments
+define hidden amdgpu_gfx float @callee(float %v.arg0, float inreg %s.arg1) {
+; GCN-LABEL: callee:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_add_f32_e32 v0, s4, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %add = fadd float %v.arg0, %s.arg1
+ ret float %add
+}
+
+define amdgpu_gfx float @caller(float %arg0) {
+; GCN-LABEL: caller:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT: s_mov_b32 s4, 2.0
+; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, callee@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, callee@rel32@hi+12
+; GCN-NEXT: s_setpc_b64 s[6:7]
+ %add = fadd float %arg0, 1.0
+ %call = tail call amdgpu_gfx float @callee(float %add, float 2.0)
+ ret float %call
+}
More information about the llvm-commits
mailing list