[llvm] AMDGPU: Fix verifier error on tail call target in vgprs (PR #110984)
via llvm-commits
llvm-commits@lists.llvm.org
Thu Oct 3 04:33:19 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
We allow tail calls of known uniform function pointers. This
would produce a verifier error if the uniform value is in VGPRs.
Insert readfirstlanes just in case this occurs, which will fold
out later if it is unnecessary.
GlobalISel should need a similar fix, but it currently does not
attempt tail calls of indirect calls.
Fixes subissue of #110930
---
Full diff: https://github.com/llvm/llvm-project/pull/110984.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+20)
- (added) llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll (+58)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5e4cf705cc9e47..ecab7c4e22aa02 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3565,6 +3565,7 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
if (IsVarArg)
return false;
+ // FIXME: We need to know all arguments passed in SGPR are uniform.
for (const Argument &Arg : CallerF.args()) {
if (Arg.hasByValAttr())
return false;
@@ -3877,6 +3878,25 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
std::vector<SDValue> Ops;
Ops.push_back(Chain);
+
+ if (IsTailCall) {
+ // isEligibleForTailCallOptimization considered whether the call target is
+ // divergent, but we may still end up with a uniform value in a VGPR. Insert
+ // a readfirstlane just in case.
+ SDValue ReadFirstLaneID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
+
+ SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
+ if (CLI.ConvergenceControlToken) {
+ SDValue TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue,
+ CLI.ConvergenceControlToken);
+ ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
+ }
+
+ Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
+ ReadfirstlaneArgs);
+ }
+
Ops.push_back(Callee);
// Add a redundant copy of the callee global which will not be legalized, as
// we need direct access to the callee later.
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll
new file mode 100644
index 00000000000000..60fe7c47e0df79
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; The tail call target is known uniform, but will be in a VGPR, so we
+; need readfirstlane to legalize it.
+define void @tail_call_uniform_vgpr_value() {
+; CHECK-LABEL: tail_call_uniform_vgpr_value:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: ds_read_b64 v[0:1], v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s17, v1
+; CHECK-NEXT: v_readfirstlane_b32 s16, v0
+; CHECK-NEXT: s_setpc_b64 s[16:17]
+ %fptr = load ptr, ptr addrspace(3) null, align 8
+ tail call void %fptr()
+ ret void
+}
+
+@constant = external hidden addrspace(4) constant ptr
+
+; readfirstlanes should fold out.
+define void @tail_call_uniform_sgpr_value() {
+; CHECK-LABEL: tail_call_uniform_sgpr_value:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, constant@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, constant@rel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[16:17]
+ %fptr = load ptr, ptr addrspace(4) @constant, align 8
+ tail call void %fptr()
+ ret void
+}
+
+define void @tail_call_uniform_vgpr_value_convergence_tokens() #0 {
+; CHECK-LABEL: tail_call_uniform_vgpr_value_convergence_tokens:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: ds_read_b64 v[0:1], v0
+; CHECK-NEXT: ; meta instruction
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s19, v1
+; CHECK-NEXT: v_readfirstlane_b32 s18, v0
+; CHECK-NEXT: s_setpc_b64 s[18:19]
+ %t = call token @llvm.experimental.convergence.entry()
+ %fptr = load ptr, ptr addrspace(3) null, align 8
+ tail call void %fptr() #0 [ "convergencectrl"(token %t) ]
+ ret void
+}
+
+attributes #0 = { convergent }
``````````
</details>
https://github.com/llvm/llvm-project/pull/110984
More information about the llvm-commits
mailing list