[llvm] c08d7b3 - AMDGPU: Fix verifier error on tail call target in vgprs (#110984)
via llvm-commits
llvm-commits@lists.llvm.org
Thu Oct 3 10:51:00 PDT 2024
Author: Matt Arsenault
Date: 2024-10-03T21:50:56+04:00
New Revision: c08d7b3de7409aecadd7f9edfe0f3a1ce28a6374
URL: https://github.com/llvm/llvm-project/commit/c08d7b3de7409aecadd7f9edfe0f3a1ce28a6374
DIFF: https://github.com/llvm/llvm-project/commit/c08d7b3de7409aecadd7f9edfe0f3a1ce28a6374.diff
LOG: AMDGPU: Fix verifier error on tail call target in vgprs (#110984)
We allow tail calls of known-uniform function pointers. This
would produce a verifier error if the uniform value is in VGPRs.
Insert readfirstlanes to cover that case; they fold out later if
they turn out to be unnecessary.
GlobalISel will need a similar fix, but it currently does not
attempt tail calls of indirect calls.
Fixes #107447
Fixes subissue of #110930
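For context, the core failing pattern is a tail call through a function
pointer loaded from LDS: the pointer is known uniform, but the load result
lives in VGPRs, while the tail-call pseudo requires its target in SGPRs.
A minimal sketch, condensed from the test added below:

target triple = "amdgcn-amd-amdhsa"

define void @tail_call_uniform_vgpr_value() {
  %fptr = load ptr, ptr addrspace(3) null, align 8 ; LDS load result lands in VGPRs
  tail call void %fptr()                           ; tail-call target must be in SGPRs
  ret void
}

With this change, the DAG lowering wraps such a callee in
llvm.amdgcn.readfirstlane, so the branch target ends up in SGPRs
(s[16:17] in the generated checks below); when the callee is already
uniform in SGPRs, the readfirstlanes fold away.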
Added:
llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 44d7804647fa02..cad9d17fd7d16e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3565,6 +3565,7 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
if (IsVarArg)
return false;
+ // FIXME: We need to know all arguments passed in SGPR are uniform.
for (const Argument &Arg : CallerF.args()) {
if (Arg.hasByValAttr())
return false;
@@ -3875,15 +3876,37 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
InGlue = Chain.getValue(1);
}
- std::vector<SDValue> Ops;
- Ops.push_back(Chain);
- Ops.push_back(Callee);
+ std::vector<SDValue> Ops({Chain});
+
// Add a redundant copy of the callee global which will not be legalized, as
// we need direct access to the callee later.
if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = GSD->getGlobal();
+ Ops.push_back(Callee);
Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
} else {
+ if (IsTailCall) {
+ assert(!Callee->isDivergent() &&
+ "cannot tail call a divergent call target");
+
+ // isEligibleForTailCallOptimization considered whether the call target is
+ // divergent, but we may still end up with a uniform value in a VGPR.
+ // Insert a readfirstlane just in case.
+ SDValue ReadFirstLaneID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
+
+ SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
+ if (CLI.ConvergenceControlToken) {
+ SDValue TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {},
+ MVT::Glue, CLI.ConvergenceControlToken);
+ ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
+ }
+
+ Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
+ ReadfirstlaneArgs);
+ }
+
+ Ops.push_back(Callee);
Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index 05fd8837684801..84dce0ffb31e8f 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -486,7 +486,14 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
@@ -495,7 +502,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], 0, 0, killed [[S_MOV_B32_]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+ ; DAGISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE1]], 0, 0, killed [[S_MOV_B32_]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
;
; DAGISEL-GFX10-LABEL: name: indirect
; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -510,10 +517,17 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY13]]
; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
@@ -521,7 +535,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], 0, 0, killed [[S_MOV_B32_]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+ ; DAGISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE1]], 0, 0, killed [[S_MOV_B32_]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
unreachable
}
@@ -697,7 +711,14 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i
; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
@@ -705,7 +726,7 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], 0, 0, [[COPY7]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+ ; DAGISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE1]], 0, 0, [[COPY7]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
;
; DAGISEL-GFX10-LABEL: name: indirect_with_non_imm_exec
; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -721,9 +742,16 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i
; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY14]]
; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
@@ -731,7 +759,7 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], 0, 0, [[COPY7]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+ ; DAGISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE1]], 0, 0, [[COPY7]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
unreachable
}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index 28492f1426921d..72b2654257a936 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -486,7 +486,14 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
@@ -495,7 +502,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], 0, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+ ; DAGISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE1]], 0, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
;
; DAGISEL-GFX10-LABEL: name: indirect
; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -510,10 +517,17 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY13]]
; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
@@ -521,7 +535,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], 0, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+ ; DAGISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE1]], 0, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
unreachable
}
@@ -711,7 +725,14 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i
; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
@@ -719,7 +740,7 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE1]], 0, 0, killed [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+ ; DAGISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE2]], 0, 0, killed [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
;
; DAGISEL-GFX10-LABEL: name: indirect_with_non_imm_exec
; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -737,9 +758,16 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i
; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY11]]
+ ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY15]]
; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
@@ -747,7 +775,7 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE1]], 0, 0, killed [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+ ; DAGISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE2]], 0, 0, killed [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i64 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
unreachable
}
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll
new file mode 100644
index 00000000000000..60fe7c47e0df79
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; The tail call target is known uniform, but will be in a VGPR, so we
+; need readfirstlane to legalize it.
+define void @tail_call_uniform_vgpr_value() {
+; CHECK-LABEL: tail_call_uniform_vgpr_value:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: ds_read_b64 v[0:1], v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s17, v1
+; CHECK-NEXT: v_readfirstlane_b32 s16, v0
+; CHECK-NEXT: s_setpc_b64 s[16:17]
+  %fptr = load ptr, ptr addrspace(3) null, align 8
+  tail call void %fptr()
+  ret void
+}
+
+@constant = external hidden addrspace(4) constant ptr
+
+; readfirstlanes should fold out.
+define void @tail_call_uniform_sgpr_value() {
+; CHECK-LABEL: tail_call_uniform_sgpr_value:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, constant@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, constant@rel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[16:17]
+  %fptr = load ptr, ptr addrspace(4) @constant, align 8
+  tail call void %fptr()
+  ret void
+}
+
+define void @tail_call_uniform_vgpr_value_convergence_tokens() #0 {
+; CHECK-LABEL: tail_call_uniform_vgpr_value_convergence_tokens:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: ds_read_b64 v[0:1], v0
+; CHECK-NEXT: ; meta instruction
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s19, v1
+; CHECK-NEXT: v_readfirstlane_b32 s18, v0
+; CHECK-NEXT: s_setpc_b64 s[18:19]
+  %t = call token @llvm.experimental.convergence.entry()
+  %fptr = load ptr, ptr addrspace(3) null, align 8
+  tail call void %fptr() #0 [ "convergencectrl"(token %t) ]
+  ret void
+}
+
+attributes #0 = { convergent }