[llvm] [AMDGPU] Fix @llvm.amdgcn.cs.chain with SGPR args not provably uniform (PR #114232)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 30 06:48:55 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
The correct behaviour is to insert a readfirstlane. SelectionDAG was
already doing this in some cases, but not in the general case for chain
calls. GlobalISel was already doing this for return values but not for
arguments.
---
Patch is 96.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114232.diff
8 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp (-7)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+7-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll (+24-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll (+6-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll (+66-30)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll (+73)
- (modified) llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll (+160-76)
- (modified) llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll (+160-76)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 351e9f25e29cfc..ab62e530a18d0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -230,13 +230,6 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
return AddrReg.getReg(0);
}
- void assignValueToReg(Register ValVReg, Register PhysReg,
- const CCValAssign &VA) override {
- MIB.addUse(PhysReg, RegState::Implicit);
- Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
- MIRBuilder.buildCopy(PhysReg, ExtReg);
- }
-
void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
const MachinePointerInfo &MPO,
const CCValAssign &VA) override {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 52ca38aca5c771..bc2b1bb4525fd0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3855,10 +3855,14 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
unsigned ArgIdx = 0;
for (auto [Reg, Val] : RegsToPass) {
- if (ArgIdx++ >= NumSpecialInputs && !Val->isDivergent() &&
+ if (ArgIdx++ >= NumSpecialInputs && (IsChainCallConv || !Val->isDivergent()) &&
TRI->isSGPRPhysReg(Reg)) {
- // Speculatively insert a readfirstlane in case this is a uniform value in
- // a VGPR.
+ // For chain calls, the inreg arguments are required to be
+ // uniform. Speculatively Insert a readfirstlane in case we cannot prove
+ // they are uniform.
+ //
+ // For other calls, if an inreg arguments is known to be uniform,
+ // speculatively insert a readfirstlane in case it is in a VGPR.
//
// FIXME: We need to execute this in a waterfall loop if it is a divergent
// value, so let that continue to produce invalid code.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
index 3438cbdd476d85..4b0ff1b2eb4704 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
@@ -24,9 +24,12 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
- ; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; GFX11-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; GFX11-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; GFX11-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; GFX11-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32)
; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5)
; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32)
@@ -50,9 +53,12 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
- ; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; GFX10-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; GFX10-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32)
; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5)
; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32)
@@ -82,9 +88,12 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
- ; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32)
- ; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32)
+ ; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; GFX11-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; GFX11-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; GFX11-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; GFX11-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32)
; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5)
; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32)
@@ -108,9 +117,12 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
- ; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32)
- ; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; GFX10-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+ ; GFX10-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32)
; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5)
; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
index 5effd24a752088..adad38de380d7d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
@@ -50,7 +50,8 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
- ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32)
+ ; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
@@ -99,8 +100,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
- ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
- ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT1]](s32)
+ ; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[LOAD2]](s32)
+ ; CHECK-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index c3694158e7b971..96c3575e3190c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -942,7 +942,8 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
- ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32)
+ ; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
@@ -3984,8 +3985,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
- ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
- ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT1]](s32)
+ ; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[LOAD2]](s32)
+ ; CHECK-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
@@ -5309,7 +5312,8 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
- ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5354,7 +5358,8 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5402,8 +5407,10 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5451,8 +5458,10 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5499,7 +5508,8 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
- ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5546,7 +5556,8 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
- ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5591,7 +5602,8 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5639,8 +5651,10 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5685,7 +5699,9 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](<2 x s16>)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5738,8 +5754,12 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF]](s16)
; CHECK-NEXT: [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s16>)
- ; CHECK-NEXT: $sgpr0 = COPY [[UV7]](<2 x s16>)
- ; CHECK-NEXT: $sgpr1 = COPY [[UV8]](<2 x s16>)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/114232
More information about the llvm-commits
mailing list