[llvm-branch-commits] [llvm] AMDGPU: Do not tail call if an inreg argument requires waterfalling (PR #111002)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Oct 3 08:08:08 PDT 2024
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/111002
If we have a divergent value passed to an outgoing inreg argument,
the call needs to be executed in a waterfall loop and thus cannot
be tail called.
The waterfall handling of arbitrary calls is broken on the selectiondag
path, so some of these cases still hit an error later.
I also noticed the argument evaluation code in isEligibleForTailCallOptimization
is not correctly accounting for implicit argument assignments. It also seems
inreg codegen is generally broken; we are assigning arguments to the reserved
private resource descriptor.
>From 7e7685b87e0fc00f9d329f3402885e5e01c03672 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 3 Oct 2024 16:06:49 +0400
Subject: [PATCH] AMDGPU: Do not tail call if an inreg argument requires
waterfalling
If we have a divergent value passed to an outgoing inreg argument,
the call needs to be executed in a waterfall loop and thus cannot
be tail called.
The waterfall handling of arbitrary calls is broken on the selectiondag
path, so some of these cases still hit an error later.
I also noticed the argument evaluation code in isEligibleForTailCallOptimization
is not correctly accounting for implicit argument assignments. It also seems
inreg codegen is generally broken; we are assigning arguments to the reserved
private resource descriptor.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 3 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 59 +++++-
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 3 +
.../isel-amdgcn-cs-chain-intrinsic-w32.ll | 196 +++++++++++++-----
.../isel-amdgcn-cs-chain-intrinsic-w64.ll | 196 +++++++++++++-----
.../AMDGPU/tail-call-inreg-arguments.error.ll | 78 +++++++
.../AMDGPU/tail-call-inreg-arguments.ll | 97 +++++++++
7 files changed, 510 insertions(+), 122 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 25e36dc4b3691f..2cde47c743f9e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1142,6 +1142,9 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
return false;
}
+ // FIXME: We need to check if any arguments passed in SGPR are uniform. If
+ // they are not, this cannot be a tail call. If they are uniform, but may be
+ // VGPR, we need to insert readfirstlanes.
if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 67e5b3de741412..53cb0800f7fd27 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3593,6 +3593,8 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
+ // FIXME: We are not allocating special input registers, so we will be
+ // deciding based on incorrect register assignments.
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -3602,6 +3604,21 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
return false;
+ for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
+ // FIXME: What about inreg arguments that end up passed in memory?
+ if (!CCVA.isRegLoc())
+ continue;
+
+ // If we are passing an argument in an SGPR, and the value is divergent,
+ // this call requires a waterfall loop.
+ if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
+ LLVM_DEBUG(
+ dbgs() << "Cannot tail call due to divergent outgoing argument in "
+ << printReg(CCVA.getLocReg(), TRI) << '\n');
+ return false;
+ }
+ }
+
const MachineRegisterInfo &MRI = MF.getRegInfo();
return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
}
@@ -3734,6 +3751,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// arguments to begin at SP+0. Completely unused for non-tail calls.
int32_t FPDiff = 0;
MachineFrameInfo &MFI = MF.getFrameInfo();
+ auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
@@ -3756,6 +3774,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
}
+ const unsigned NumSpecialInputs = RegsToPass.size();
+
MVT PtrVT = MVT::i32;
// Walk the register/memloc assignments, inserting copies/loads.
@@ -3857,16 +3877,40 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+ SDValue ReadFirstLaneID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
+
+ SDValue TokenGlue;
+ if (CLI.ConvergenceControlToken) {
+ TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
+ CLI.ConvergenceControlToken);
+ }
+
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InGlue;
- for (auto &RegToPass : RegsToPass) {
- Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
- RegToPass.second, InGlue);
+
+ unsigned ArgIdx = 0;
+ for (auto [Reg, Val] : RegsToPass) {
+ if (ArgIdx++ >= NumSpecialInputs && !Val->isDivergent() &&
+ TRI->isSGPRPhysReg(Reg)) {
+ // Speculatively insert a readfirstlane in case this is a uniform value in
+ // a VGPR.
+ //
+ // FIXME: We need to execute this in a waterfall loop if it is a divergent
+ // value, so let that continue to produce invalid code.
+
+ SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
+ if (TokenGlue)
+ ReadfirstlaneArgs.push_back(TokenGlue);
+ Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
+ ReadfirstlaneArgs);
+ }
+
+ Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
InGlue = Chain.getValue(1);
}
-
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call, however in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
@@ -3896,12 +3940,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
- if (CLI.ConvergenceControlToken) {
- SDValue TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {},
- MVT::Glue, CLI.ConvergenceControlToken);
+ if (TokenGlue)
ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
- }
-
Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
ReadfirstlaneArgs);
}
@@ -3928,7 +3968,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// Add a register mask operand representing the call-preserved registers.
- auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 409e5418abc8ec..99fa632c0300be 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -210,6 +210,9 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
}
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const;
+ bool isSGPRPhysReg(Register Reg) const {
+ return isSGPRClass(getPhysRegBaseClass(Reg));
+ }
/// \returns true if this class contains only VGPR registers
static bool isVGPRClass(const TargetRegisterClass *RC) {
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index 84dce0ffb31e8f..469d0453b9dfb1 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -73,10 +73,16 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -97,12 +103,18 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -177,10 +189,16 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -201,12 +219,18 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+ ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -281,10 +305,16 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -305,12 +335,18 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -385,10 +421,16 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -409,12 +451,18 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+ ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -494,10 +542,16 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_4]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -525,12 +579,18 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY13]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY16]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_4]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -608,9 +668,15 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -632,11 +698,17 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY8]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY11]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -719,9 +791,15 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i
; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_4]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -750,11 +828,17 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i
; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY14]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY17]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_4]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index 72b2654257a936..51c28a02b7f821 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -73,10 +73,16 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -97,12 +103,18 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -177,10 +189,16 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -201,12 +219,18 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+ ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -281,10 +305,16 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -305,12 +335,18 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -385,10 +421,16 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -409,12 +451,18 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+ ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -494,10 +542,16 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_4]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -525,12 +579,18 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY13]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY16]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_4]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -614,9 +674,15 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -640,11 +706,17 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY12]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -733,9 +805,15 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i
; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
- ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec
+ ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_4]]
; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
@@ -766,11 +844,17 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i
; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
- ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
- ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY15]]
- ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]]
- ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]]
- ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec
+ ; DAGISEL-GFX10-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+ ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY18]]
+ ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_4]]
; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]]
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
new file mode 100644
index 00000000000000..cd6cb4d1e9fe45
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 2> %t.err < %s | FileCheck %s
+; RUN: FileCheck -check-prefix=ERR %s < %t.err
+; FIXME: These tests cannot be tail called, and should be executed in a waterfall loop.
+
+declare hidden void @void_func_i32_inreg(i32 inreg)
+
+; ERR: error: <unknown>:0:0: in function tail_call_i32_inreg_divergent void (i32): illegal VGPR to SGPR copy
+; ERR: error: <unknown>:0:0: in function indirect_tail_call_i32_inreg_divergent void (i32): illegal VGPR to SGPR copy
+
+define void @tail_call_i32_inreg_divergent(i32 %vgpr) {
+; CHECK-LABEL: tail_call_i32_inreg_divergent:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s16, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[18:19]
+; CHECK-NEXT: v_writelane_b32 v40, s16, 2
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, void_func_i32_inreg at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg at rel32@hi+12
+; CHECK-NEXT: ; illegal copy v0 to s0
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s4, v40, 2
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_addk_i32 s32, 0xfc00
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ tail call void @void_func_i32_inreg(i32 inreg %vgpr)
+ ret void
+}
+
+ at constant = external hidden addrspace(4) constant ptr
+
+define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) {
+; CHECK-LABEL: indirect_tail_call_i32_inreg_divergent:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s16, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[18:19]
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v40, s16, 2
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, constant at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, constant at rel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: ; illegal copy v0 to s0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s4, v40, 2
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_addk_i32 s32, 0xfc00
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %fptr = load ptr, ptr addrspace(4) @constant, align 8
+ tail call void %fptr(i32 inreg %vgpr)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
new file mode 100644
index 00000000000000..2db62111745f23
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+declare hidden void @void_func_i32_inreg(i32 inreg)
+
+define void @tail_call_i32_inreg_uniform(i32 inreg %sgpr) {
+; CHECK-LABEL: tail_call_i32_inreg_uniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s0, s6
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, void_func_i32_inreg at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg at rel32@hi+12
+; CHECK-NEXT: s_setpc_b64 s[16:17]
+ tail call void @void_func_i32_inreg(i32 inreg %sgpr)
+ ret void
+}
+
+ at constant = external hidden addrspace(4) constant ptr
+
+define void @indirect_tail_call_i32_inreg_uniform(i32 inreg %sgpr) {
+; CHECK-LABEL: indirect_tail_call_i32_inreg_uniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, constant at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, constant at rel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7
+; CHECK-NEXT: s_mov_b32 s0, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[16:17]
+ %fptr = load ptr, ptr addrspace(4) @constant, align 8
+ tail call void %fptr(i32 inreg %sgpr)
+ ret void
+}
+
+declare void @void_func_i64_inreg(i64 inreg)
+
+define void @tail_call_i64_inreg_uniform(i64 inreg %sgpr) {
+; CHECK-LABEL: tail_call_i64_inreg_uniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, void_func_i64_inreg at gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i64_inreg at gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT: s_mov_b32 s1, s7
+; CHECK-NEXT: s_mov_b32 s0, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[16:17]
+ tail call void @void_func_i64_inreg(i64 inreg %sgpr)
+ ret void
+}
+
+define void @tail_call_i64_inreg_uniform_in_vgpr() {
+; CHECK-LABEL: tail_call_i64_inreg_uniform_in_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: ds_read_b64 v[0:1], v0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, void_func_i64_inreg at gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i64_inreg at gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: s_setpc_b64 s[16:17]
+ %uniform.vgpr = load i64, ptr addrspace(3) null, align 8
+ tail call void @void_func_i64_inreg(i64 inreg %uniform.vgpr)
+ ret void
+}
+
+define void @tail_call_i64_inreg_uniform_in_vgpr_convergence_tokens() #0 {
+; CHECK-LABEL: tail_call_i64_inreg_uniform_in_vgpr_convergence_tokens:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: ds_read_b64 v[0:1], v0
+; CHECK-NEXT: s_getpc_b64 s[18:19]
+; CHECK-NEXT: s_add_u32 s18, s18, void_func_i64_inreg at gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s19, s19, void_func_i64_inreg at gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0
+; CHECK-NEXT: ; meta instruction
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: s_setpc_b64 s[18:19]
+ %t = call token @llvm.experimental.convergence.entry()
+ %uniform.vgpr = load i64, ptr addrspace(3) null, align 8
+ tail call void @void_func_i64_inreg(i64 inreg %uniform.vgpr) #0 [ "convergencectrl"(token %t) ]
+ ret void
+}
+
+attributes #0 = { convergent }
More information about the llvm-branch-commits
mailing list