[llvm] [AMDGPU] ISel for @llvm.amdgcn.cs.chain intrinsic (PR #68186)

via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 30 06:16:53 PDT 2023


https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/68186

>From 95a06660b1bf0f172217b4d5230a87676dc72eb1 Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Wed, 4 Oct 2023 09:24:45 +0200
Subject: [PATCH 1/4] [AMDGPU] ISel for @llvm.amdgcn.cs.chain intrinsic

The @llvm.amdgcn.cs.chain intrinsic is essentially a call. The call
parameters are bundled up into 2 intrinsic arguments, one for those that
should go in the SGPRs (the 3rd intrinsic argument), and one for those
that should go in the VGPRs (the 4th intrinsic argument). Both will
often be some kind of aggregate.

Both instruction selection frameworks have some internal representation
for intrinsics (G_INTRINSIC[_WITH_SIDE_EFFECTS] for GlobalISel,
ISD::INTRINSIC_[VOID|WITH_CHAIN] for DAGISel), but we can't use those
because aggregates are dissolved very early on during ISel and we'd lose
the inreg information. Therefore, this patch shortcircuits both the
IRTranslator and SelectionDAGBuilder to lower this intrinsic as a call
from the very start. It tries to use the existing infrastructure as much
as possible, by calling into the code for lowering tail calls.

This has already gone through a few rounds of review in Phab:

Differential Revision: https://reviews.llvm.org/D153761
---
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |   3 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  54 ++
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  94 ++-
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h   |   3 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  58 +-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |   5 +
 .../irtranslator-amdgcn-cs-chain.ll           | 129 ++++
 .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 424 ++++++++++++
 .../AMDGPU/amdgpu-cs-chain-preserve-cc.ll     | 501 +++++++++++++-
 .../isel-amdgcn-cs-chain-intrinsic-w32.ll     | 637 +++++++++++++++++
 .../isel-amdgcn-cs-chain-intrinsic-w64.ll     | 645 ++++++++++++++++++
 11 files changed, 2537 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll

diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index d8f9e30b2599779..331e31ed6df51d5 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -62,6 +62,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/PatternMatch.h"
@@ -2390,6 +2391,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     Info.OrigRet = {Register(), Type::getVoidTy(CI.getContext()), 0};
     return CLI->lowerCall(MIRBuilder, Info);
   }
+  case Intrinsic::amdgcn_cs_chain:
+    return translateCallBase(CI, MIRBuilder);
   case Intrinsic::fptrunc_round: {
     uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 71f6a3791c2cee0..c87330c3788499a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -76,6 +76,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
@@ -7424,6 +7425,59 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     setValue(&I, Val);
     return;
   }
+  case Intrinsic::amdgcn_cs_chain: {
+    assert(I.arg_size() == 5 && "Additional args not supported yet");
+    assert(cast<ConstantInt>(I.getOperand(4))->isZero() &&
+           "Non-zero flags not supported yet");
+
+    // At this point we don't care if it's amdgpu_cs_chain or
+    // amdgpu_cs_chain_preserve.
+    CallingConv::ID CC = CallingConv::AMDGPU_CS_Chain;
+
+    Type *RetTy = I.getType();
+    assert(RetTy->isVoidTy() && "Should not return");
+
+    SDValue Callee = getValue(I.getOperand(0));
+
+    // We only have 2 actual args: one for the SGPRs and one for the VGPRs.
+    TargetLowering::ArgListTy Args;
+    Args.reserve(2);
+
+    for (unsigned Idx : {2, 3}) {
+      TargetLowering::ArgListEntry Arg;
+      Arg.Node = getValue(I.getOperand(Idx));
+      Arg.Ty = I.getOperand(Idx)->getType();
+      Arg.setAttributes(&I, Idx);
+      Args.push_back(Arg);
+    }
+
+    assert(Args[0].IsInReg && "SGPR args should be marked inreg");
+    assert(!Args[1].IsInReg && "VGPR args should not be marked inreg");
+
+    // We're also going to pass the EXEC mask as the last argument.
+    TargetLowering::ArgListEntry Arg;
+    Arg.Node = getValue(I.getOperand(1));
+    Arg.Ty = I.getOperand(1)->getType();
+    Arg.IsInReg = true;
+    Args.push_back(Arg);
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(getCurSDLoc())
+        .setChain(getRoot())
+        .setCallee(CC, RetTy, Callee, std::move(Args))
+        .setNoReturn(true)
+        .setTailCall(true)
+        .setConvergent(I.isConvergent());
+    CLI.CB = &I;
+    std::pair<SDValue, SDValue> Result =
+        lowerInvokable(CLI, /*EHPadBB*/ nullptr);
+    (void)Result;
+    assert(!Result.first.getNode() && !Result.second.getNode() &&
+           "Should've lowered as tail call");
+
+    HasTailCall = true;
+    return;
+  }
   case Intrinsic::ptrmask: {
     SDValue Ptr = getValue(I.getOperand(0));
     SDValue Mask = getValue(I.getOperand(1));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index db0f56416051f01..02e1a5aa818bc8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -958,8 +958,10 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
 
 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                               bool IsTailCall, CallingConv::ID CC) {
-  assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, "
-                                        "because the address can be divergent");
+  // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
+  assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
+         "Indirect calls can't be tail calls, "
+         "because the address can be divergent");
   if (!IsTailCall)
     return AMDGPU::G_SI_CALL;
 
@@ -1150,14 +1152,20 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
 void AMDGPUCallLowering::handleImplicitCallArguments(
     MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
     const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
+    CallingConv::ID CalleeCC,
     ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
   if (!ST.enableFlatScratch()) {
     // Insert copies for the SRD. In the HSA case, this should be an identity
     // copy.
     auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
                                                FuncInfo.getScratchRSrcReg());
-    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-    CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+
+    auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
+                             ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
+                             : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+
+    MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
+    CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
   }
 
   for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
@@ -1253,7 +1261,8 @@ bool AMDGPUCallLowering::lowerTailCall(
   // after the ordinary user argument registers.
   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
 
-  if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
+  if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+      !AMDGPU::isChainCC(Info.CallConv)) {
     // With a fixed ABI, allocate fixed registers before user arguments.
     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
       return false;
@@ -1269,7 +1278,8 @@ bool AMDGPUCallLowering::lowerTailCall(
   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
     return false;
 
-  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
+  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
+                              ImplicitArgRegs);
 
   // If we have -tailcallopt, we need to adjust the stack. We'll do the call
   // sequence start and end here.
@@ -1283,6 +1293,23 @@ bool AMDGPUCallLowering::lowerTailCall(
     MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
   }
 
+  // If this is a chain call, we need to set EXEC right before the call.
+  if (AMDGPU::isChainCC(Info.CallConv)) {
+    ArgInfo ExecArg = Info.OrigArgs[1];
+    assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
+
+    if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
+      return false;
+
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    MCRegister Exec = TRI->getExec();
+    auto SetExec =
+        MIRBuilder.buildInstr(MovOpc).addDef(Exec).addReg(ExecArg.Regs[0]);
+    SetExec->getOperand(1).setReg(constrainOperandRegClass(
+        MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *SetExec,
+        SetExec->getDesc(), SetExec->getOperand(1), 1));
+  }
+
   // Now we can add the actual call instruction to the correct basic block.
   MIRBuilder.insertInstr(MIB);
 
@@ -1303,8 +1330,60 @@ bool AMDGPUCallLowering::lowerTailCall(
   return true;
 }
 
+/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
+bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
+                                        CallLoweringInfo &Info) const {
+  ArgInfo Callee = Info.OrigArgs[0];
+  ArgInfo SGPRArgs = Info.OrigArgs[2];
+  ArgInfo VGPRArgs = Info.OrigArgs[3];
+  ArgInfo Flags = Info.OrigArgs[4];
+
+  assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
+         "Non-zero flags aren't supported yet.");
+  assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");
+
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = MF.getFunction();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // The function to jump to is actually the first argument, so we'll change the
+  // Callee and other info to match that before using our existing helper.
+  const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
+  if (const Function *F = dyn_cast<Function>(CalleeV)) {
+    Info.Callee = MachineOperand::CreateGA(F, 0);
+    Info.CallConv = F->getCallingConv();
+  } else {
+    assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
+    Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
+    Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
+                                                  // behaves the same here.
+  }
+
+  // The function that we're calling cannot be vararg (only the intrinsic is).
+  Info.IsVarArg = false;
+
+  assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
+                     [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+         "SGPR arguments should be marked inreg");
+  assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
+                      [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+         "VGPR arguments should not be marked inreg");
+
+  SmallVector<ArgInfo, 8> OutArgs;
+  splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
+  splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);
+
+  Info.IsMustTailCall = true;
+  return lowerTailCall(MIRBuilder, Info, OutArgs);
+}
+
 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                    CallLoweringInfo &Info) const {
+  if (Function *F = Info.CB->getCalledFunction())
+    if (F->isIntrinsic())
+      return F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
+             lowerChainCall(MIRBuilder, Info);
+
   if (Info.IsVarArg) {
     LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
     return false;
@@ -1395,7 +1474,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
-  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
+  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
+                              ImplicitArgRegs);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getStackSize();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 569c6d75204d417..a6e801f2a547b53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -75,10 +75,13 @@ class AMDGPUCallLowering final : public CallLowering {
   void handleImplicitCallArguments(
       MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
       const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI,
+      CallingConv::ID CalleeCC,
       ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const;
 
   bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
                      SmallVectorImpl<ArgInfo> &OutArgs) const;
+  bool lowerChainCall(MachineIRBuilder &MIRBuilder,
+                      CallLoweringInfo &Info) const;
   bool lowerCall(MachineIRBuilder &MIRBuilder,
                  CallLoweringInfo &Info) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c97486437ed83f1..86a11c684e6c3e6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3261,6 +3261,9 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
     const SmallVectorImpl<ISD::OutputArg> &Outs,
     const SmallVectorImpl<SDValue> &OutVals,
     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  if (AMDGPU::isChainCC(CalleeCC))
+    return true;
+
   if (!mayTailCallThisCC(CalleeCC))
     return false;
 
@@ -3345,7 +3348,36 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
 // The wave scratch offset register is used as the global base pointer.
 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
+
   SelectionDAG &DAG = CLI.DAG;
+
+  TargetLowering::ArgListEntry RequestedExec;
+  if (IsChainCallConv) {
+    // The last argument should be the value that we need to put in EXEC.
+    // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
+    // don't treat it like the rest of the arguments.
+    RequestedExec = CLI.Args.back();
+    assert(RequestedExec.Node && "No node for EXEC");
+
+    if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
+      return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
+
+    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
+    CLI.Outs.pop_back();
+    CLI.OutVals.pop_back();
+
+    if (RequestedExec.Ty->isIntegerTy(64)) {
+      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
+      CLI.Outs.pop_back();
+      CLI.OutVals.pop_back();
+    }
+
+    assert(CLI.Outs.back().OrigArgIndex != 2 &&
+           "Haven't popped all the pieces of the EXEC mask");
+  }
+
   const SDLoc &DL = CLI.DL;
   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
@@ -3353,7 +3385,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   SDValue Chain = CLI.Chain;
   SDValue Callee = CLI.Callee;
   bool &IsTailCall = CLI.IsTailCall;
-  CallingConv::ID CallConv = CLI.CallConv;
   bool IsVarArg = CLI.IsVarArg;
   bool IsSibCall = false;
   bool IsThisReturn = false;
@@ -3384,9 +3415,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   if (IsTailCall) {
     IsTailCall = isEligibleForTailCallOptimization(
       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
-    if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
+    if (!IsTailCall &&
+        ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
       report_fatal_error("failed to perform tail call elimination on a call "
-                         "site marked musttail");
+                         "site marked musttail or to llvm.amdgcn.cs.chain");
     }
 
     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
@@ -3409,7 +3441,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
 
-  if (CallConv != CallingConv::AMDGPU_Gfx) {
+  if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
     // With a fixed ABI, allocate fixed registers before user arguments.
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }
@@ -3435,16 +3467,20 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
-  if (!IsSibCall) {
+  if (!IsSibCall)
     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
 
+  if (!IsSibCall || IsChainCallConv) {
     if (!Subtarget->enableFlatScratch()) {
       SmallVector<SDValue, 4> CopyFromChains;
 
       // In the HSA case, this should be an identity copy.
       SDValue ScratchRSrcReg
         = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
-      RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+      RegsToPass.emplace_back(IsChainCallConv
+                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
+                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
+                              ScratchRSrcReg);
       CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
       Chain = DAG.getTokenFactor(DL, CopyFromChains);
     }
@@ -3570,6 +3606,15 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     InGlue = Chain.getValue(1);
   }
 
+  auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+
+  if (IsChainCallConv) {
+    // Set EXEC right before the call.
+    MCRegister ExecReg = TRI->getExec();
+    Chain = DAG.getCopyToReg(Chain, DL, ExecReg, RequestedExec.Node, InGlue);
+    InGlue = Chain.getValue(1);
+  }
+
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
@@ -3598,7 +3643,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Add a register mask operand representing the call-preserved registers.
 
-  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d0a81673d6528c2..a34172f26de260b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -421,6 +421,11 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
   case CallingConv::AMDGPU_Gfx:
     return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                                : CSR_AMDGPU_SI_Gfx_RegMask;
+  case CallingConv::AMDGPU_CS_Chain:
+  case CallingConv::AMDGPU_CS_ChainPreserve:
+    // Calls to these functions never return, so we can pretend everything is
+    // preserved.
+    return AMDGPU_AllVGPRs_RegMask;
   default:
     return nullptr;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
new file mode 100644
index 000000000000000..15ebda007520e72
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc --global-isel=1 -march=amdgcn -mcpu=gfx1100 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX11
+; RUN: llc --global-isel=1 -march=amdgcn -mcpu=gfx1030 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX10
+
+declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
+declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
+declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }, i32, ...) noreturn
+
+define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GFX11-LABEL: name: chain_call
+  ; GFX11: bb.1 (%ir-block.0):
+  ; GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+  ; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+  ; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX11-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8
+  ; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9
+  ; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10
+  ; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11
+  ; GFX11-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
+  ; GFX11-NEXT:   [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1
+  ; GFX11-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX11-NEXT:   [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
+  ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
+  ; GFX11-NEXT:   $sgpr0 = COPY [[UV]](s32)
+  ; GFX11-NEXT:   $sgpr1 = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   $sgpr2 = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   $vgpr8 = COPY [[COPY3]](s32)
+  ; GFX11-NEXT:   $vgpr9 = COPY [[COPY4]](p5)
+  ; GFX11-NEXT:   $vgpr10 = COPY [[COPY5]](s32)
+  ; GFX11-NEXT:   $vgpr11 = COPY [[COPY6]](s32)
+  ; GFX11-NEXT:   $exec_lo = S_MOV_B32 [[C]](s32)
+  ; GFX11-NEXT:   SI_TCRETURN [[GV1]](p0), @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GFX10-LABEL: name: chain_call
+  ; GFX10: bb.1 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11
+  ; GFX10-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
+  ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
+  ; GFX10-NEXT:   $sgpr0 = COPY [[UV]](s32)
+  ; GFX10-NEXT:   $sgpr1 = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   $sgpr2 = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   $vgpr8 = COPY [[COPY3]](s32)
+  ; GFX10-NEXT:   $vgpr9 = COPY [[COPY4]](p5)
+  ; GFX10-NEXT:   $vgpr10 = COPY [[COPY5]](s32)
+  ; GFX10-NEXT:   $vgpr11 = COPY [[COPY6]](s32)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]](<4 x s32>)
+  ; GFX10-NEXT:   $exec_lo = S_MOV_B32 [[C]](s32)
+  ; GFX10-NEXT:   SI_TCRETURN [[GV1]](p0), @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GFX11-LABEL: name: chain_preserve_call
+  ; GFX11: bb.1 (%ir-block.0):
+  ; GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+  ; GFX11-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+  ; GFX11-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX11-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; GFX11-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8
+  ; GFX11-NEXT:   [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9
+  ; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10
+  ; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11
+  ; GFX11-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
+  ; GFX11-NEXT:   [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1
+  ; GFX11-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX11-NEXT:   [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
+  ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
+  ; GFX11-NEXT:   $sgpr0 = COPY [[UV]](s32)
+  ; GFX11-NEXT:   $sgpr1 = COPY [[UV1]](s32)
+  ; GFX11-NEXT:   $sgpr2 = COPY [[UV2]](s32)
+  ; GFX11-NEXT:   $vgpr8 = COPY [[COPY3]](s32)
+  ; GFX11-NEXT:   $vgpr9 = COPY [[COPY4]](p5)
+  ; GFX11-NEXT:   $vgpr10 = COPY [[COPY5]](s32)
+  ; GFX11-NEXT:   $vgpr11 = COPY [[COPY6]](s32)
+  ; GFX11-NEXT:   $exec_lo = S_MOV_B32 [[C]](s32)
+  ; GFX11-NEXT:   SI_TCRETURN [[GV1]](p0), @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GFX10-LABEL: name: chain_preserve_call
+  ; GFX10: bb.1 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11
+  ; GFX10-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
+  ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
+  ; GFX10-NEXT:   $sgpr0 = COPY [[UV]](s32)
+  ; GFX10-NEXT:   $sgpr1 = COPY [[UV1]](s32)
+  ; GFX10-NEXT:   $sgpr2 = COPY [[UV2]](s32)
+  ; GFX10-NEXT:   $vgpr8 = COPY [[COPY3]](s32)
+  ; GFX10-NEXT:   $vgpr9 = COPY [[COPY4]](p5)
+  ; GFX10-NEXT:   $vgpr10 = COPY [[COPY5]](s32)
+  ; GFX10-NEXT:   $vgpr11 = COPY [[COPY6]](s32)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]](<4 x s32>)
+  ; GFX10-NEXT:   $exec_lo = S_MOV_B32 [[C]](s32)
+  ; GFX10-NEXT:   SI_TCRETURN [[GV1]](p0), @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 7beaf3103586375..6600107c82e8141 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -31,6 +31,8 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_no_stack({ptr, i32, <4 x i32>} inre
   ret void
 }
 
+; FIXME: Setup s32.
+
 define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr, <4 x i32> %vgpr) {
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_simple_call:
 ; GISEL-GFX11:       ; %bb.0:
@@ -374,3 +376,425 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
   call amdgpu_gfx void @use(<24 x i32> %sgprs, <24 x i32> %vgprs)
   ret void
 }
+
+define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: cs_to_chain:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v10, v2
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; GISEL-GFX10-LABEL: cs_to_chain:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_mov_b32 s100, SCRATCH_RSRC_DWORD0
+; GISEL-GFX10-NEXT:    s_mov_b32 s101, SCRATCH_RSRC_DWORD1
+; GISEL-GFX10-NEXT:    s_mov_b32 s102, -1
+; GISEL-GFX10-NEXT:    s_mov_b32 s103, 0x31c16000
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
+; GISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b64 s[48:49], s[100:101]
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX11-LABEL: cs_to_chain:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v10, v2
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: cs_to_chain:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s100, SCRATCH_RSRC_DWORD0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s101, SCRATCH_RSRC_DWORD1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s102, -1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s103, 0x31c16000
+; DAGISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b64 s[48:49], s[100:101]
+; DAGISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: chain_to_chain:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; GISEL-GFX10-LABEL: chain_to_chain:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX11-LABEL: chain_to_chain:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: chain_to_chain:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: chain_to_chain_use_all_v0_v7:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v11, v8
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; GISEL-GFX10-LABEL: chain_to_chain_use_all_v0_v7:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v11, v8
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX11-LABEL: chain_to_chain_use_all_v0_v7:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v11, v8
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: chain_to_chain_use_all_v0_v7:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v11, v8
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call void asm "s_nop", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: chain_to_chain_fewer_args:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX11-NEXT:    s_mov_b32 s2, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s2
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GISEL-GFX11-NEXT:    s_add_u32 s2, s2, chain_callee_2 at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s3, s3, chain_callee_2 at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[2:3]
+;
+; GISEL-GFX10-LABEL: chain_to_chain_fewer_args:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX10-NEXT:    s_mov_b32 s2, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s2
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[2:3]
+; GISEL-GFX10-NEXT:    s_add_u32 s2, s2, chain_callee_2 at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s3, s3, chain_callee_2 at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[2:3]
+;
+; DAGISEL-GFX11-LABEL: chain_to_chain_fewer_args:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s2, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s2
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: chain_to_chain_fewer_args:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s2, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s2
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
+  %v = shufflevector <3 x i32> %b, <3 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
+  call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <2 x i32>, <2 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v2i32(ptr @chain_callee_2, i32 -1, <2 x i32> inreg %s, <2 x i32> %v, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: chain_to_chain_more_args:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, 0
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v11, 0
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; GISEL-GFX10-LABEL: chain_to_chain_more_args:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, 0
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX11-LABEL: chain_to_chain_more_args:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, 0
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v11, 0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: chain_to_chain_more_args:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, 0
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v = shufflevector <3 x i32> %b, <3 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <4 x i32>, <4 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v4i32(ptr @chain_callee_2, i32 -1, <4 x i32> inreg %s, <4 x i32> %v, i32 0)
+  unreachable
+}
+
+declare void @llvm.amdgcn.cs.chain.v2i32(ptr, i32, <2 x i32>, <2 x i32>, i32, ...)
+declare void @llvm.amdgcn.cs.chain.v3i32(ptr, i32, <3 x i32>, <3 x i32>, i32, ...)
+declare void @llvm.amdgcn.cs.chain.v4i32(ptr, i32, <4 x i32>, <4 x i32>, i32, ...)
+declare amdgpu_cs_chain void @chain_callee_2(<2 x i32> inreg, <2 x i32>)
+declare amdgpu_cs_chain void @chain_callee(<3 x i32> inreg, <3 x i32>)
+declare amdgpu_cs_chain void @chain_callee_4(<4 x i32> inreg, <4 x i32>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index daa3d39a3afebd8..7257a68005e427f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -4,8 +4,6 @@
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
-declare amdgpu_gfx void @use(...)
-
 ; FIXME: The values of the counters are undefined on entry to amdgpu_cs_chain_preserve functions, so these waits are unnecessary.
 
 define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_no_stack({ptr, i32, <4 x i32>} inreg %a, {ptr, i32, <4 x i32>} %b) {
@@ -30,3 +28,502 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_no_stack({ptr, i3
 ; DAGISEL-GFX10-NEXT:    s_endpgm
   ret void
 }
+
+define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: cs_to_chain_preserve:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v10, v2
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; GISEL-GFX10-LABEL: cs_to_chain_preserve:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_mov_b32 s100, SCRATCH_RSRC_DWORD0
+; GISEL-GFX10-NEXT:    s_mov_b32 s101, SCRATCH_RSRC_DWORD1
+; GISEL-GFX10-NEXT:    s_mov_b32 s102, -1
+; GISEL-GFX10-NEXT:    s_mov_b32 s103, 0x31c16000
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
+; GISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b64 s[48:49], s[100:101]
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX11-LABEL: cs_to_chain_preserve:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v10, v2
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: cs_to_chain_preserve:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s100, SCRATCH_RSRC_DWORD0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s101, SCRATCH_RSRC_DWORD1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s102, -1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s103, 0x31c16000
+; DAGISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b64 s[48:49], s[100:101]
+; DAGISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_preserve_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: chain_to_chain_preserve:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; GISEL-GFX10-LABEL: chain_to_chain_preserve:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX11-LABEL: chain_to_chain_preserve:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: chain_to_chain_preserve:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_preserve_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
+  unreachable
+}
+
+; FIXME: Preserve things (i.e. v16)!
+; FIXME: Setup s32.
+
+define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_preserve_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: chain_preserve_to_chain:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; GISEL-GFX10-LABEL: chain_preserve_to_chain:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX11-LABEL: chain_preserve_to_chain:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: chain_preserve_to_chain:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v11, v8
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v11, v8
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v11, v8
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v11, v8
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call void asm "s_nop", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_args(<3 x i32> inreg %a, <3 x i32> %b) {
+; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX11-NEXT:    s_mov_b32 s2, s0
+; GISEL-GFX11-NEXT:    ;;#ASMSTART
+; GISEL-GFX11-NEXT:    s_nop
+; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s2
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GISEL-GFX11-NEXT:    s_add_u32 s2, s2, chain_preserve_callee_2 at gotpcrel32@lo+4
+; GISEL-GFX11-NEXT:    s_addc_u32 s3, s3, chain_preserve_callee_2 at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[2:3]
+;
+; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
+; GISEL-GFX10:       ; %bb.0:
+; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-GFX10-NEXT:    s_mov_b32 s2, s0
+; GISEL-GFX10-NEXT:    ;;#ASMSTART
+; GISEL-GFX10-NEXT:    s_nop
+; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s2
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[2:3]
+; GISEL-GFX10-NEXT:    s_add_u32 s2, s2, chain_preserve_callee_2 at gotpcrel32@lo+4
+; GISEL-GFX10-NEXT:    s_addc_u32 s3, s3, chain_preserve_callee_2 at gotpcrel32@hi+12
+; GISEL-GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[2:3]
+;
+; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
+; DAGISEL-GFX11:       ; %bb.0:
+; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee_2 at gotpcrel32@lo+4
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee_2 at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s2, s0
+; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX11-NEXT:    s_nop
+; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s2
+; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
+; DAGISEL-GFX10:       ; %bb.0:
+; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
+; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee_2 at gotpcrel32@lo+4
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee_2 at gotpcrel32@hi+12
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s2, s0
+; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
+; DAGISEL-GFX10-NEXT:    s_nop
+; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s2
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
+  %v = shufflevector <3 x i32> %b, <3 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
+  call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
+  call void(ptr, i32, <2 x i32>, <2 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v2i32(ptr @chain_preserve_callee_2, i32 -1, <2 x i32> inreg %s, <2 x i32> %v, i32 0)
+  unreachable
+}
+
+; Note that amdgpu_cs_chain_preserve functions are not allowed to call
+; llvm.amdgcn.cs.chain with more vgpr args than they received as parameters.
+
+declare void @llvm.amdgcn.cs.chain.v3i32(ptr, i32, <3 x i32>, <3 x i32>, i32, ...)
+declare amdgpu_cs_chain_preserve void @chain_preserve_callee(<3 x i32> inreg, <3 x i32>)
+declare amdgpu_cs_chain void @chain_callee(<3 x i32> inreg, <3 x i32>)
+
+declare void @llvm.amdgcn.cs.chain.v2i32(ptr, i32, <2 x i32>, <2 x i32>, i32, ...)
+declare amdgpu_cs_chain_preserve void @chain_preserve_callee_2(<2 x i32> inreg, <2 x i32>)
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
new file mode 100644
index 000000000000000..00e6e121a279cc5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -0,0 +1,637 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+
+declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
+declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
+declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) noreturn
+
+define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: chain_to_chain
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: chain_to_chain
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: chain_to_chain
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: chain_to_chain
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: cs_to_chain
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: cs_to_chain
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: cs_to_chain
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: cs_to_chain
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: chain_to_chain_preserve
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: chain_to_chain_preserve
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: chain_to_chain_preserve
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: cs_to_chain_preserve
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: cs_to_chain_preserve
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: cs_to_chain_preserve
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: indirect
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY7]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY8]]
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: indirect
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY8]]
+  ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: indirect
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: indirect
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; DAGISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: non_imm_exec
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY1]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY7]]
+  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[COPY]]
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: non_imm_exec
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY8]]
+  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[COPY]]
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: non_imm_exec
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[COPY7]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: non_imm_exec
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY8]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[COPY7]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
new file mode 100644
index 000000000000000..be3269d812a62ef
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -0,0 +1,645 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+
+declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
+declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
+declare void @llvm.amdgcn.cs.chain(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) noreturn
+
+define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: chain_to_chain
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: chain_to_chain
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: chain_to_chain
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: chain_to_chain
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: cs_to_chain
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: cs_to_chain
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: cs_to_chain
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: cs_to_chain
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: chain_to_chain_preserve
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: chain_to_chain_preserve
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: chain_to_chain_preserve
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: cs_to_chain_preserve
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: cs_to_chain_preserve
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: cs_to_chain_preserve
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: indirect
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY7]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY8]]
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: indirect
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY8]]
+  ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: indirect
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: indirect
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; DAGISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
+
+define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: non_imm_exec
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY2]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY7]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY8]]
+  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[REG_SEQUENCE]]
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: non_imm_exec
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY2]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY8]]
+  ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
+  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[REG_SEQUENCE]]
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: non_imm_exec
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: non_imm_exec
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[REG_SEQUENCE]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i64 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}

>From 4485b78838ed1c6b31cfab86eee1a4f0f0585588 Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Tue, 24 Oct 2023 10:33:28 +0200
Subject: [PATCH 2/4] fixup! [AMDGPU] ISel for @llvm.amdgcn.cs.chain intrinsic

---
 .../irtranslator-amdgcn-cs-chain.ll           |  4 +-
 .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 40 +++++++++----------
 .../AMDGPU/amdgpu-cs-chain-preserve-cc.ll     | 40 +++++++++----------
 .../isel-amdgcn-cs-chain-intrinsic-w32.ll     |  8 ++--
 .../isel-amdgcn-cs-chain-intrinsic-w64.ll     |  8 ++--
 5 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
index 15ebda007520e72..a9e119e624f191c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc --global-isel=1 -march=amdgcn -mcpu=gfx1100 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX11
-; RUN: llc --global-isel=1 -march=amdgcn -mcpu=gfx1030 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX10
+; RUN: llc --global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX11
+; RUN: llc --global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX10
 
 declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
 declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 6600107c82e8141..8a86c82f33944c8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -1,13 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
 declare amdgpu_gfx void @use(...)
 
-; FIXME: The values of the counters are undefined on entry to amdgpu_cs_chain functions, so these waits are unnecessary.
-
 define amdgpu_cs_chain void @amdgpu_cs_chain_no_stack({ptr, i32, <4 x i32>} inreg %a, {ptr, i32, <4 x i32>} %b) {
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_no_stack:
 ; GISEL-GFX11:       ; %bb.0:
@@ -398,11 +396,14 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ;
 ; GISEL-GFX10-LABEL: cs_to_chain:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_mov_b32 s100, SCRATCH_RSRC_DWORD0
-; GISEL-GFX10-NEXT:    s_mov_b32 s101, SCRATCH_RSRC_DWORD1
-; GISEL-GFX10-NEXT:    s_mov_b32 s102, -1
-; GISEL-GFX10-NEXT:    s_mov_b32 s103, 0x31c16000
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[100:101]
+; GISEL-GFX10-NEXT:    s_mov_b32 s100, s0
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[100:103], s[100:101], 0x10
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_bitset0_b32 s103, 21
 ; GISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
 ; GISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
@@ -412,8 +413,6 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[48:49], s[100:101]
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
@@ -442,18 +441,21 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ;
 ; DAGISEL-GFX10-LABEL: cs_to_chain:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s100, SCRATCH_RSRC_DWORD0
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s101, SCRATCH_RSRC_DWORD1
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s102, -1
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s103, 0x31c16000
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[100:101]
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s100, s0
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; DAGISEL-GFX10-NEXT:    s_load_dwordx4 s[100:103], s[100:101], 0x10
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_bitset0_b32 s103, 21
 ; DAGISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
 ; DAGISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
 ; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
 ; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
@@ -461,8 +463,6 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index 7257a68005e427f..efbdce30f993071 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -1,10 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
-
-; FIXME: The values of the counters are undefined on entry to amdgpu_cs_chain_preserve functions, so these waits are unnecessary.
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
 define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_no_stack({ptr, i32, <4 x i32>} inreg %a, {ptr, i32, <4 x i32>} %b) {
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_no_stack:
@@ -50,11 +48,14 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ;
 ; GISEL-GFX10-LABEL: cs_to_chain_preserve:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_mov_b32 s100, SCRATCH_RSRC_DWORD0
-; GISEL-GFX10-NEXT:    s_mov_b32 s101, SCRATCH_RSRC_DWORD1
-; GISEL-GFX10-NEXT:    s_mov_b32 s102, -1
-; GISEL-GFX10-NEXT:    s_mov_b32 s103, 0x31c16000
+; GISEL-GFX10-NEXT:    s_getpc_b64 s[100:101]
+; GISEL-GFX10-NEXT:    s_mov_b32 s100, s0
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[100:103], s[100:101], 0x10
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_bitset0_b32 s103, 21
 ; GISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
 ; GISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
@@ -64,8 +65,6 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[48:49], s[100:101]
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
@@ -94,18 +93,21 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ;
 ; DAGISEL-GFX10-LABEL: cs_to_chain_preserve:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s100, SCRATCH_RSRC_DWORD0
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s101, SCRATCH_RSRC_DWORD1
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s102, -1
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s103, 0x31c16000
+; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[100:101]
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s100, s0
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; DAGISEL-GFX10-NEXT:    s_load_dwordx4 s[100:103], s[100:101], 0x10
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX10-NEXT:    s_bitset0_b32 s103, 21
 ; DAGISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
 ; DAGISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
 ; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
 ; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
+; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
@@ -113,8 +115,6 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index 00e6e121a279cc5..1a7dd3d1682bbbf 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
 declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
 declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index be3269d812a62ef..2c08aaf205710e9 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
 declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
 declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })

>From e332ec81a50c15762cbd22bc7416f47f2b25feae Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Tue, 24 Oct 2023 15:00:27 +0200
Subject: [PATCH 3/4] fixup! [AMDGPU] ISel for @llvm.amdgcn.cs.chain intrinsic

---
 .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 241 ++++++------------
 .../AMDGPU/amdgpu-cs-chain-preserve-cc.ll     | 180 ++++---------
 .../isel-amdgcn-cs-chain-intrinsic-w32.ll     | 182 +++++++------
 .../isel-amdgcn-cs-chain-intrinsic-w64.ll     | 150 ++++++-----
 4 files changed, 331 insertions(+), 422 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 8a86c82f33944c8..8fc4c4b56776ed6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -35,26 +35,20 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_simple_call:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+12
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v4, v8 :: v_dual_mov_b32 v5, v9
-; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v6, v10 :: v_dual_mov_b32 v7, v11
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, use at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, use at abs32@hi
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: amdgpu_cs_chain_simple_call:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+12
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v4, v8
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v5, v9
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v6, v10
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v7, v11
@@ -63,34 +57,29 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, use at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, use at abs32@hi
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
 ;
 ; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_simple_call:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v7, v11 :: v_dual_mov_b32 v6, v10
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v5, v9 :: v_dual_mov_b32 v4, v8
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, use at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, use at abs32@lo
+; DAGISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; DAGISEL-GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; DAGISEL-GFX11-NEXT:    s_endpgm
 ;
 ; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_simple_call:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v7, v11
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v6, v10
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v5, v9
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v4, v8
@@ -99,8 +88,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, use at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, use at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; DAGISEL-GFX10-NEXT:    s_endpgm
   call amdgpu_gfx void @use(<4 x i32> %sgpr, <4 x i32> %vgpr)
@@ -133,6 +123,10 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v24, s24
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v25, s25
 ; GISEL-GFX11-NEXT:    s_add_u32 s24, s32, 40
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
 ; GISEL-GFX11-NEXT:    s_add_u32 s25, s32, 44
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v26, s24
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v27, s25
@@ -140,18 +134,10 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; GISEL-GFX11-NEXT:    s_add_u32 s25, s32, 52
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v28, s24
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v29, s25
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[24:25]
-; GISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
-; GISEL-GFX11-NEXT:    s_load_b64 s[24:25], s[24:25], 0x0
-; GISEL-GFX11-NEXT:    v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
-; GISEL-GFX11-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
-; GISEL-GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
-; GISEL-GFX11-NEXT:    s_add_u32 s26, s32, 56
-; GISEL-GFX11-NEXT:    s_add_u32 s27, s32, 60
-; GISEL-GFX11-NEXT:    scratch_store_b32 off, v30, s26
-; GISEL-GFX11-NEXT:    scratch_store_b32 off, v31, s27
+; GISEL-GFX11-NEXT:    s_add_u32 s24, s32, 56
+; GISEL-GFX11-NEXT:    s_add_u32 s25, s32, 60
+; GISEL-GFX11-NEXT:    scratch_store_b32 off, v30, s24
+; GISEL-GFX11-NEXT:    scratch_store_b32 off, v31, s25
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
@@ -168,18 +154,16 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v26, v34 :: v_dual_mov_b32 v27, v35
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v28, v36 :: v_dual_mov_b32 v29, v37
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v30, v38 :: v_dual_mov_b32 v31, v39
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s24, use at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s25, use at abs32@hi
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    s_swappc_b64 s[30:31], s[24:25]
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: amdgpu_cs_chain_spill:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[24:25]
-; GISEL-GFX10-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+12
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[24:25], s[24:25], 0x0
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v33, v9
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v34, v10
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v35, v11
@@ -236,8 +220,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v30, v38
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v31, v39
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GISEL-GFX10-NEXT:    s_mov_b32 s24, use at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s25, use at abs32@hi
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    s_swappc_b64 s[30:31], s[24:25]
 ; GISEL-GFX10-NEXT:    s_endpgm
 ;
@@ -264,6 +249,10 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v24, s24
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v23, s25
 ; DAGISEL-GFX11-NEXT:    s_add_i32 s24, s32, 24
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v36, v11 :: v_dual_mov_b32 v37, v10
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v38, v9 :: v_dual_mov_b32 v39, v8
 ; DAGISEL-GFX11-NEXT:    s_add_i32 s25, s32, 20
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v22, s24
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v21, s25
@@ -271,18 +260,10 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; DAGISEL-GFX11-NEXT:    s_add_i32 s25, s32, 12
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v20, s24
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v19, s25
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[24:25]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+12
-; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[24:25], s[24:25], 0x0
-; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
-; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v36, v11 :: v_dual_mov_b32 v37, v10
-; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v38, v9 :: v_dual_mov_b32 v39, v8
-; DAGISEL-GFX11-NEXT:    s_add_i32 s26, s32, 8
-; DAGISEL-GFX11-NEXT:    s_add_i32 s27, s32, 4
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v18, s26
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v17, s27
+; DAGISEL-GFX11-NEXT:    s_add_i32 s24, s32, 8
+; DAGISEL-GFX11-NEXT:    s_add_i32 s25, s32, 4
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v18, s24
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v17, s25
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
@@ -299,18 +280,16 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v26, v37 :: v_dual_mov_b32 v27, v36
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v28, v35 :: v_dual_mov_b32 v29, v34
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v30, v33 :: v_dual_mov_b32 v31, v32
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s25, use at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s24, use at abs32@lo
+; DAGISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; DAGISEL-GFX11-NEXT:    s_swappc_b64 s[30:31], s[24:25]
 ; DAGISEL-GFX11-NEXT:    s_endpgm
 ;
 ; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_spill:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[24:25]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v32, v15
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[24:25], s[24:25], 0x0
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v33, v14
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v34, v13
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v35, v12
@@ -367,8 +346,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v30, v33
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v31, v32
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s25, use at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s24, use at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_swappc_b64 s[30:31], s[24:25]
 ; DAGISEL-GFX10-NEXT:    s_endpgm
   call amdgpu_gfx void @use(<24 x i32> %sgprs, <24 x i32> %vgprs)
@@ -384,14 +364,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: cs_to_chain:
@@ -415,28 +392,22 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: cs_to_chain:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v10, v2
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: cs_to_chain:
@@ -445,17 +416,15 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s100, s0
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
 ; DAGISEL-GFX10-NEXT:    s_load_dwordx4 s[100:103], s[100:101], 0x10
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
 ; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_bitset0_b32 s103, 21
 ; DAGISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
 ; DAGISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
@@ -464,7 +433,6 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
   call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
@@ -481,14 +449,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain:
@@ -502,47 +467,38 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
   call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
@@ -559,14 +515,11 @@ define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_use_all_v0_v7:
@@ -580,47 +533,38 @@ define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_use_all_v0_v7:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v11, v8
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain_use_all_v0_v7:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v11, v8
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v16},~{s0}"()
   call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
@@ -637,14 +581,11 @@ define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s2
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GISEL-GFX11-NEXT:    s_add_u32 s2, s2, chain_callee_2 at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s3, s3, chain_callee_2 at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s2, chain_callee_2 at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, chain_callee_2 at abs32@hi
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[2:3]
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_fewer_args:
@@ -658,47 +599,38 @@ define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s2
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[2:3]
-; GISEL-GFX10-NEXT:    s_add_u32 s2, s2, chain_callee_2 at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s3, s3, chain_callee_2 at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s2, chain_callee_2 at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, chain_callee_2 at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[2:3]
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_fewer_args:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s2, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain_fewer_args:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s2, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
   %v = shufflevector <3 x i32> %b, <3 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
@@ -720,11 +652,9 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, 0
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v11, 0
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_more_args:
@@ -740,50 +670,41 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_more_args:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, 0
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v11, 0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain_more_args:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee_2 at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee_2 at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, 0
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v = shufflevector <3 x i32> %b, <3 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index efbdce30f993071..cf7507ba294a9de 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -36,14 +36,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: cs_to_chain_preserve:
@@ -67,28 +64,22 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: cs_to_chain_preserve:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v10, v2
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: cs_to_chain_preserve:
@@ -97,17 +88,15 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s100, s0
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
 ; DAGISEL-GFX10-NEXT:    s_load_dwordx4 s[100:103], s[100:101], 0x10
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
 ; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_bitset0_b32 s103, 21
 ; DAGISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
 ; DAGISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
@@ -116,7 +105,6 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
   call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_preserve_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
@@ -133,14 +121,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_preserve:
@@ -154,47 +139,38 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_preserve:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain_preserve:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
   call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_preserve_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
@@ -214,14 +190,11 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve:
@@ -235,47 +208,38 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
   call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_preserve_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
@@ -292,14 +256,11 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain:
@@ -313,47 +274,38 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
   call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
@@ -370,14 +322,11 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
@@ -391,47 +340,38 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v11, v8
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_callee at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_callee at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v11, v8
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v16},~{s0}"()
   call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
@@ -448,14 +388,11 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s2
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GISEL-GFX11-NEXT:    s_add_u32 s2, s2, chain_preserve_callee_2 at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s3, s3, chain_preserve_callee_2 at gotpcrel32@hi+12
-; GISEL-GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_mov_b32 s2, chain_preserve_callee_2 at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s3, chain_preserve_callee_2 at abs32@hi
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[2:3]
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
@@ -469,47 +406,38 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s2
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_getpc_b64 s[2:3]
-; GISEL-GFX10-NEXT:    s_add_u32 s2, s2, chain_preserve_callee_2 at gotpcrel32@lo+4
-; GISEL-GFX10-NEXT:    s_addc_u32 s3, s3, chain_preserve_callee_2 at gotpcrel32@hi+12
-; GISEL-GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX10-NEXT:    s_mov_b32 s2, chain_preserve_callee_2 at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, chain_preserve_callee_2 at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[2:3]
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, chain_preserve_callee_2 at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee_2 at gotpcrel32@hi+12
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s2, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee_2 at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee_2 at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; DAGISEL-GFX10:       ; %bb.0:
 ; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX10-NEXT:    s_add_u32 s4, s4, chain_preserve_callee_2 at gotpcrel32@lo+4
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s5, s5, chain_preserve_callee_2 at gotpcrel32@hi+12
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; DAGISEL-GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s2, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee_2 at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee_2 at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
   %v = shufflevector <3 x i32> %b, <3 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index 1a7dd3d1682bbbf..d7b469f60c6e11c 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -29,9 +29,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: chain_to_chain
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -55,9 +57,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: chain_to_chain
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -70,9 +74,10 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
   ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
@@ -80,8 +85,8 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: chain_to_chain
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -94,10 +99,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
-  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
@@ -106,8 +112,8 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -133,9 +139,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: cs_to_chain
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -159,9 +167,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: cs_to_chain
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -174,9 +184,10 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
   ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
@@ -184,8 +195,8 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: cs_to_chain
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -198,10 +209,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
-  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
@@ -210,8 +222,8 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -237,9 +249,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: chain_to_chain_preserve
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -263,9 +277,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -278,9 +294,10 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
   ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
@@ -288,8 +305,8 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: chain_to_chain_preserve
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -302,10 +319,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
-  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
@@ -314,8 +332,8 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -341,9 +359,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: cs_to_chain_preserve
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -367,9 +387,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -382,9 +404,10 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
   ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
@@ -392,8 +415,8 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: cs_to_chain_preserve
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -406,10 +429,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
-  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
@@ -418,8 +442,8 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -553,9 +577,11 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY7]]
   ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[COPY]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY8]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: non_imm_exec
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -579,9 +605,11 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
   ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY8]]
   ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[COPY]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY9]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: non_imm_exec
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -595,8 +623,9 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
   ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
@@ -605,7 +634,7 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[COPY7]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: non_imm_exec
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -619,8 +648,9 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY8]]
   ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
@@ -631,7 +661,7 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[COPY7]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index 2c08aaf205710e9..93c66865014289c 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -29,9 +29,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: chain_to_chain
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -55,9 +57,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: chain_to_chain
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -70,8 +74,9 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
@@ -81,7 +86,7 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: chain_to_chain
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -94,8 +99,9 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; DAGISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
@@ -107,7 +113,7 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -133,9 +139,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: cs_to_chain
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -159,9 +167,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: cs_to_chain
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -174,8 +184,9 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
@@ -185,7 +196,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: cs_to_chain
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -198,8 +209,9 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; DAGISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
@@ -211,7 +223,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -237,9 +249,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: chain_to_chain_preserve
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -263,9 +277,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -278,8 +294,9 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
@@ -289,7 +306,7 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: chain_to_chain_preserve
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -302,8 +319,9 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; DAGISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
@@ -315,7 +333,7 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -341,9 +359,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: cs_to_chain_preserve
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -367,9 +387,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -382,8 +404,9 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
@@ -393,7 +416,7 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: cs_to_chain_preserve
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -406,8 +429,9 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; DAGISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
   ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
@@ -419,7 +443,7 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -555,9 +579,11 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY7]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY8]]
   ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[REG_SEQUENCE]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE1]]
+  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY9]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: non_imm_exec
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -583,9 +609,11 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
   ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
   ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[REG_SEQUENCE]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE1]]
+  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY10]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: non_imm_exec
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -601,8 +629,9 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
   ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
   ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
@@ -611,7 +640,7 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX11-NEXT:   $exec = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE1]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: non_imm_exec
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -627,8 +656,9 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
   ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; DAGISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-  ; DAGISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; DAGISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
   ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
@@ -639,7 +669,7 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
   ; DAGISEL-GFX10-NEXT:   $exec = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE1]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i64 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }

>From 6119b44f971f1dfda04807430506e884a0388ac3 Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Fri, 27 Oct 2023 13:27:41 +0200
Subject: [PATCH 4/4] fixup! [AMDGPU] ISel for @llvm.amdgcn.cs.chain intrinsic

Switch to pseudo
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  13 +-
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  50 ++--
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td     |   5 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  26 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  44 ++++
 .../Target/AMDGPU/SILateBranchLowering.cpp    |  20 ++
 .../irtranslator-amdgcn-cs-chain.ll           |  20 +-
 .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 103 ++++----
 .../AMDGPU/amdgpu-cs-chain-preserve-cc.ll     | 116 +++++----
 .../isel-amdgcn-cs-chain-intrinsic-w32.ll     | 234 ++++++++++++------
 .../isel-amdgcn-cs-chain-intrinsic-w64.ll     | 194 ++++++++++-----
 13 files changed, 522 insertions(+), 305 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c87330c3788499a..c682db5b37dd6cc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7440,10 +7440,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue Callee = getValue(I.getOperand(0));
 
     // We only have 2 actual args: one for the SGPRs and one for the VGPRs.
+    // We'll also tack the value of the EXEC mask at the end.
     TargetLowering::ArgListTy Args;
-    Args.reserve(2);
+    Args.reserve(3);
 
-    for (unsigned Idx : {2, 3}) {
+    for (unsigned Idx : {2, 3, 1}) {
       TargetLowering::ArgListEntry Arg;
       Arg.Node = getValue(I.getOperand(Idx));
       Arg.Ty = I.getOperand(Idx)->getType();
@@ -7453,13 +7454,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
 
     assert(Args[0].IsInReg && "SGPR args should be marked inreg");
     assert(!Args[1].IsInReg && "VGPR args should not be marked inreg");
-
-    // We're also going to pass the EXEC mask as the last argument.
-    TargetLowering::ArgListEntry Arg;
-    Arg.Node = getValue(I.getOperand(1));
-    Arg.Ty = I.getOperand(1)->getType();
-    Arg.IsInReg = true;
-    Args.push_back(Arg);
+    Args[2].IsInReg = true; // EXEC should be inreg
 
     TargetLowering::CallLoweringInfo CLI(DAG);
     CLI.setDebugLoc(getCurSDLoc())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 02e1a5aa818bc8f..9f3f61863998b50 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -957,7 +957,8 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
 }
 
 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
-                              bool IsTailCall, CallingConv::ID CC) {
+                              bool IsTailCall, bool isWave32,
+                              CallingConv::ID CC) {
   // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
   assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
          "Indirect calls can't be tail calls, "
@@ -965,6 +966,9 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
   if (!IsTailCall)
     return AMDGPU::G_SI_CALL;
 
+  if (AMDGPU::isChainCC(CC))
+    return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
+
   return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
                                          AMDGPU::SI_TCRETURN;
 }
@@ -1197,7 +1201,8 @@ bool AMDGPUCallLowering::lowerTailCall(
   if (!IsSibCall)
     CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
 
-  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC);
+  unsigned Opc =
+      getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC);
   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
   if (!addCallTargetOperands(MIB, MIRBuilder, Info))
     return false;
@@ -1206,8 +1211,27 @@ bool AMDGPUCallLowering::lowerTailCall(
   // be 0.
   MIB.addImm(0);
 
-  // Tell the call which registers are clobbered.
+  // If this is a chain call, we need to pass in the EXEC mask.
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  if (AMDGPU::isChainCC(Info.CallConv)) {
+    ArgInfo ExecArg = Info.OrigArgs[1];
+    assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
+
+    if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
+      return false;
+
+    if (auto CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) {
+      MIB.addImm(CI->getSExtValue());
+    } else {
+      MIB.addReg(ExecArg.Regs[0]);
+      unsigned Idx = MIB->getNumOperands() - 1;
+      MIB->getOperand(Idx).setReg(constrainOperandRegClass(
+          MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
+          MIB->getDesc(), MIB->getOperand(Idx), Idx));
+    }
+  }
+
+  // Tell the call which registers are clobbered.
   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
   MIB.addRegMask(Mask);
 
@@ -1293,23 +1317,6 @@ bool AMDGPUCallLowering::lowerTailCall(
     MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
   }
 
-  // If this is a chain call, we need to set EXEC right before the call.
-  if (AMDGPU::isChainCC(Info.CallConv)) {
-    ArgInfo ExecArg = Info.OrigArgs[1];
-    assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
-
-    if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
-      return false;
-
-    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    MCRegister Exec = TRI->getExec();
-    auto SetExec =
-        MIRBuilder.buildInstr(MovOpc).addDef(Exec).addReg(ExecArg.Regs[0]);
-    SetExec->getOperand(1).setReg(constrainOperandRegClass(
-        MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *SetExec,
-        SetExec->getDesc(), SetExec->getOperand(1), 1));
-  }
-
   // Now we can add the actual call instruction to the correct basic block.
   MIRBuilder.insertInstr(MIB);
 
@@ -1432,7 +1439,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   // Create a temporarily-floating call instruction so we can add the implicit
   // uses of arg registers.
-  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv);
+  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
+                               Info.CallConv);
 
   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
   MIB.addDef(TRI->getReturnAddressReg(MF));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index adf4e0139e03c1d..cba9fa3c6ebdec5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5212,6 +5212,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CALL)
   NODE_NAME_CASE(TC_RETURN)
   NODE_NAME_CASE(TC_RETURN_GFX)
+  NODE_NAME_CASE(TC_RETURN_CHAIN)
   NODE_NAME_CASE(TRAP)
   NODE_NAME_CASE(RET_GLUE)
   NODE_NAME_CASE(WAVE_ADDRESS)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index e971c85ee3f6e39..51f95ef97a3308a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -389,6 +389,7 @@ enum NodeType : unsigned {
   CALL,
   TC_RETURN,
   TC_RETURN_GFX,
+  TC_RETURN_CHAIN,
   TRAP,
 
   // Masked control flow nodes.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 2492a7be651f6d6..324285e580bbaff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -94,6 +94,11 @@ def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP,
 [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
 >;
 
+def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN",
+  SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
 def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
   SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
     [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 86a11c684e6c3e6..66d1ab0fe5cc9de 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3418,7 +3418,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     if (!IsTailCall &&
         ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
       report_fatal_error("failed to perform tail call elimination on a call "
-                         "site marked musttail or to llvm.amdgcn.cs.chain");
+                         "site marked musttail or on llvm.amdgcn.cs.chain");
     }
 
     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
@@ -3608,13 +3608,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
 
-  if (IsChainCallConv) {
-    // Set EXEC right before the call.
-    MCRegister ExecReg = TRI->getExec();
-    Chain = DAG.getCopyToReg(Chain, DL, ExecReg, RequestedExec.Node, InGlue);
-    InGlue = Chain.getValue(1);
-  }
-
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
@@ -3634,6 +3627,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
   }
 
+  if (IsChainCallConv) {
+    Ops.push_back(RequestedExec.Node);
+  }
+
   // Add argument registers to the end of the list so that they are known live
   // into the call.
   for (auto &RegToPass : RegsToPass) {
@@ -3656,8 +3653,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   // actual call instruction.
   if (IsTailCall) {
     MFI.setHasTailCall();
-    unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ?
-                   AMDGPUISD::TC_RETURN_GFX : AMDGPUISD::TC_RETURN;
+    unsigned OPC = AMDGPUISD::TC_RETURN;
+    switch (CallConv) {
+    case CallingConv::AMDGPU_Gfx:
+      OPC = AMDGPUISD::TC_RETURN_GFX;
+      break;
+    case CallingConv::AMDGPU_CS_Chain:
+    case CallingConv::AMDGPU_CS_ChainPreserve:
+      OPC = AMDGPUISD::TC_RETURN_CHAIN;
+      break;
+    }
+
     return DAG.getNode(OPC, DL, NodeTys, Ops);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 567f1b812c1808c..f40c56ce5188997 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -665,6 +665,50 @@ def : GCNPat<
   (SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
 >;
 
+// Pseudo for the llvm.amdgcn.cs.chain intrinsic.
+// This is essentially a tail call, but it also takes a mask to put in EXEC
+// right before jumping to the callee.
+class SI_CS_CHAIN_TC<
+    ValueType execvt,
+    RegisterOperand execrc = !if(!eq(execvt, i32), SSrc_b32, SSrc_b64)>
+    : SPseudoInstSI <(outs),
+      (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)> {
+  let FixedSize = 0;
+  let isCall = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
+  let UseNamedOperandTable = 1;
+  let SchedRW = [WriteBranch];
+  let isConvergent = 1;
+
+  let WaveSizePredicate = !if(!eq(execvt, i32), isWave32, isWave64);
+}
+
+def SI_CS_CHAIN_TC_W32 : SI_CS_CHAIN_TC<i32>;
+def SI_CS_CHAIN_TC_W64 : SI_CS_CHAIN_TC<i64>;
+
+// Handle selecting direct & indirect calls via SI_CS_CHAIN_TC_W32/64
+multiclass si_cs_chain_tc_pattern<
+  dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> {
+def : GCNPat<
+  (AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec),
+  (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
+>;
+}
+
+multiclass si_cs_chain_tc_patterns<
+  ValueType execvt,
+  RegisterOperand execrc = !if(!eq(execvt, i32), SSrc_b32, SSrc_b64),
+  Instruction tc = !if(!eq(execvt, i32), SI_CS_CHAIN_TC_W32, SI_CS_CHAIN_TC_W64)
+  > {
+  defm direct: si_cs_chain_tc_pattern<(tglobaladdr:$callee), execvt, execrc, tc>;
+  defm indirect: si_cs_chain_tc_pattern<(i64 0), execvt, execrc, tc>;
+}
+
+defm : si_cs_chain_tc_patterns<i32>;
+defm : si_cs_chain_tc_patterns<i64>;
+
 def ADJCALLSTACKUP : SPseudoInstSI<
   (outs), (ins i32imm:$amt0, i32imm:$amt1),
   [(callseq_start timm:$amt0, timm:$amt1)],
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index df522a9099c0160..21231dc53665c72 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -30,6 +30,7 @@ class SILateBranchLowering : public MachineFunctionPass {
   const SIInstrInfo *TII = nullptr;
   MachineDominatorTree *MDT = nullptr;
 
+  void expandChainCall(MachineInstr &MI);
   void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);
 
 public:
@@ -116,6 +117,19 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
   MDT->getBase().applyUpdates(DTUpdates);
 }
 
+void SILateBranchLowering::expandChainCall(MachineInstr &MI) {
+  // This is a tail call that needs to be expanded into at least
+  // 2 instructions, one for setting EXEC and one for the actual tail call.
+  constexpr unsigned ExecIdx = 3;
+
+  auto SetExec =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(MovOpc), ExecReg);
+  SetExec->addOperand(MI.getOperand(ExecIdx));
+  MI.removeOperand(ExecIdx);
+
+  MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
+}
+
 void SILateBranchLowering::earlyTerm(MachineInstr &MI,
                                      MachineBasicBlock *EarlyExitBlock) {
   MachineBasicBlock &MBB = *MI.getParent();
@@ -158,6 +172,12 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
         }
         break;
 
+      case AMDGPU::SI_CS_CHAIN_TC_W32:
+      case AMDGPU::SI_CS_CHAIN_TC_W64:
+        expandChainCall(MI);
+        MadeChange = true;
+        break;
+
       case AMDGPU::SI_EARLY_TERMINATE_SCC0:
         EarlyTermInstrs.push_back(&MI);
         break;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
index a9e119e624f191c..3438cbdd476d859 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
@@ -20,7 +20,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
   ; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10
   ; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11
   ; GFX11-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
-  ; GFX11-NEXT:   [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1
+  ; GFX11-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
   ; GFX11-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX11-NEXT:   [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
@@ -31,8 +31,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
   ; GFX11-NEXT:   $vgpr9 = COPY [[COPY4]](p5)
   ; GFX11-NEXT:   $vgpr10 = COPY [[COPY5]](s32)
   ; GFX11-NEXT:   $vgpr11 = COPY [[COPY6]](s32)
-  ; GFX11-NEXT:   $exec_lo = S_MOV_B32 [[C]](s32)
-  ; GFX11-NEXT:   SI_TCRETURN [[GV1]](p0), @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GFX11-NEXT:   SI_CS_CHAIN_TC_W32 [[GV1]](p0), @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GFX10-LABEL: name: chain_call
   ; GFX10: bb.1 (%ir-block.0):
@@ -47,7 +46,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10
   ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11
   ; GFX10-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
-  ; GFX10-NEXT:   [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
@@ -60,8 +59,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
   ; GFX10-NEXT:   $vgpr11 = COPY [[COPY6]](s32)
   ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]](<4 x s32>)
-  ; GFX10-NEXT:   $exec_lo = S_MOV_B32 [[C]](s32)
-  ; GFX10-NEXT:   SI_TCRETURN [[GV1]](p0), @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GFX10-NEXT:   SI_CS_CHAIN_TC_W32 [[GV1]](p0), @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -80,7 +78,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
   ; GFX11-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10
   ; GFX11-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11
   ; GFX11-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
-  ; GFX11-NEXT:   [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1
+  ; GFX11-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
   ; GFX11-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX11-NEXT:   [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
@@ -91,8 +89,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
   ; GFX11-NEXT:   $vgpr9 = COPY [[COPY4]](p5)
   ; GFX11-NEXT:   $vgpr10 = COPY [[COPY5]](s32)
   ; GFX11-NEXT:   $vgpr11 = COPY [[COPY6]](s32)
-  ; GFX11-NEXT:   $exec_lo = S_MOV_B32 [[C]](s32)
-  ; GFX11-NEXT:   SI_TCRETURN [[GV1]](p0), @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GFX11-NEXT:   SI_CS_CHAIN_TC_W32 [[GV1]](p0), @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GFX10-LABEL: name: chain_preserve_call
   ; GFX10: bb.1 (%ir-block.0):
@@ -107,7 +104,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10
   ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11
   ; GFX10-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
-  ; GFX10-NEXT:   [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
@@ -120,8 +117,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
   ; GFX10-NEXT:   $vgpr11 = COPY [[COPY6]](s32)
   ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]](<4 x s32>)
-  ; GFX10-NEXT:   $exec_lo = S_MOV_B32 [[C]](s32)
-  ; GFX10-NEXT:   SI_TCRETURN [[GV1]](p0), @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GFX10-NEXT:   SI_CS_CHAIN_TC_W32 [[GV1]](p0), @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 8fc4c4b56776ed6..ce8b3b3ede02734 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -363,12 +363,12 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
-; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
-; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
 ; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: cs_to_chain:
@@ -379,21 +379,21 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[100:103], s[100:101], 0x10
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    s_bitset0_b32 s103, 21
 ; GISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
-; GISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[48:49], s[100:101]
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: cs_to_chain:
@@ -404,9 +404,10 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -416,22 +417,22 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s100, s0
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
 ; DAGISEL-GFX10-NEXT:    s_load_dwordx4 s[100:103], s[100:101], 0x10
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_bitset0_b32 s103, 21
 ; DAGISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[48:49], s[100:101]
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
@@ -448,12 +449,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
-; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
-; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain:
@@ -464,11 +464,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
-; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain:
@@ -481,8 +481,8 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
-; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -496,8 +496,8 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
@@ -514,12 +514,11 @@ define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
-; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
-; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_use_all_v0_v7:
@@ -530,11 +529,11 @@ define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
-; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
-; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_use_all_v0_v7:
@@ -547,8 +546,8 @@ define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
-; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -562,8 +561,8 @@ define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v16},~{s0}"()
@@ -580,13 +579,12 @@ define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
-; GISEL-GFX11-NEXT:    s_mov_b32 s0, s2
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s2
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_mov_b32 s2, chain_callee_2 at abs32@lo
-; GISEL-GFX11-NEXT:    s_mov_b32 s3, chain_callee_2 at abs32@hi
-; GISEL-GFX11-NEXT:    s_setpc_b64 s[2:3]
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_fewer_args:
 ; GISEL-GFX10:       ; %bb.0:
@@ -596,12 +594,12 @@ define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
-; GISEL-GFX10-NEXT:    s_mov_b32 s0, s2
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s2
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_mov_b32 s2, chain_callee_2 at abs32@lo
-; GISEL-GFX10-NEXT:    s_mov_b32 s3, chain_callee_2 at abs32@hi
-; GISEL-GFX10-NEXT:    s_setpc_b64 s[2:3]
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_fewer_args:
 ; DAGISEL-GFX11:       ; %bb.0:
@@ -613,8 +611,8 @@ define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
-; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -628,8 +626,8 @@ define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
@@ -648,13 +646,12 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v11, 0
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, 0
-; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v11, 0
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_more_args:
@@ -665,13 +662,13 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
-; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT:    s_mov_b32 s3, 0
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v11, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    s_mov_b32 s3, 0
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_more_args:
@@ -684,9 +681,9 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v11, 0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, 0
-; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v11, 0
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -698,12 +695,12 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee_2 at abs32@hi
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee_2 at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, 0
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index cf7507ba294a9de..aa9f0a99e2e643b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -35,12 +35,12 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
-; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
-; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
 ; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: cs_to_chain_preserve:
@@ -51,21 +51,21 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[100:103], s[100:101], 0x10
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    s_bitset0_b32 s103, 21
 ; GISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
-; GISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
+; GISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[48:49], s[100:101]
 ; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: cs_to_chain_preserve:
@@ -76,9 +76,10 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX11-NEXT:    s_nop
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; DAGISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -88,22 +89,22 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s100, s0
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v3, v0
 ; DAGISEL-GFX10-NEXT:    s_load_dwordx4 s[100:103], s[100:101], 0x10
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v9, v1
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; DAGISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_bitset0_b32 s103, 21
 ; DAGISEL-GFX10-NEXT:    s_add_u32 s100, s100, s3
-; DAGISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
 ; DAGISEL-GFX10-NEXT:    s_nop
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
+; DAGISEL-GFX10-NEXT:    s_addc_u32 s101, s101, 0
+; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[48:49], s[100:101]
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[50:51], s[102:103]
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
@@ -120,12 +121,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
-; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
-; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_preserve:
@@ -136,11 +136,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
-; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_preserve:
@@ -153,8 +153,8 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
-; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -168,8 +168,8 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
@@ -189,12 +189,11 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
-; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
-; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve:
@@ -205,11 +204,11 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
-; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
 ; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve:
@@ -222,8 +221,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
-; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -237,8 +236,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee at abs32@hi
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee at abs32@lo
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
@@ -255,12 +254,11 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
-; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
-; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain:
@@ -271,11 +269,11 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
-; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain:
@@ -288,8 +286,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
-; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -303,8 +301,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"()
@@ -321,12 +319,11 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
-; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
-; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
@@ -337,11 +334,11 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
-; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
-; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
 ; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7:
@@ -354,8 +351,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
-; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v11
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -369,8 +366,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_callee at abs32@hi
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_callee at abs32@lo
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v11
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s3
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   call void asm "s_nop", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v16},~{s0}"()
@@ -387,13 +384,12 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
 ; GISEL-GFX11-NEXT:    s_nop
 ; GISEL-GFX11-NEXT:    ;;#ASMEND
-; GISEL-GFX11-NEXT:    s_mov_b32 s0, s2
-; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee_2 at abs32@lo
+; GISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee_2 at abs32@hi
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX11-NEXT:    s_mov_b32 s0, s2
 ; GISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX11-NEXT:    s_mov_b32 s2, chain_preserve_callee_2 at abs32@lo
-; GISEL-GFX11-NEXT:    s_mov_b32 s3, chain_preserve_callee_2 at abs32@hi
-; GISEL-GFX11-NEXT:    s_setpc_b64 s[2:3]
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; GISEL-GFX10:       ; %bb.0:
@@ -403,12 +399,12 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
 ; GISEL-GFX10-NEXT:    s_nop
 ; GISEL-GFX10-NEXT:    ;;#ASMEND
-; GISEL-GFX10-NEXT:    s_mov_b32 s0, s2
+; GISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee_2 at abs32@lo
+; GISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee_2 at abs32@hi
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GISEL-GFX10-NEXT:    s_mov_b32 s0, s2
 ; GISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-GFX10-NEXT:    s_mov_b32 s2, chain_preserve_callee_2 at abs32@lo
-; GISEL-GFX10-NEXT:    s_mov_b32 s3, chain_preserve_callee_2 at abs32@hi
-; GISEL-GFX10-NEXT:    s_setpc_b64 s[2:3]
+; GISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; DAGISEL-GFX11:       ; %bb.0:
@@ -420,8 +416,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; DAGISEL-GFX11-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s5, chain_preserve_callee_2 at abs32@hi
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s4, chain_preserve_callee_2 at abs32@lo
-; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX11-NEXT:    s_setpc_b64 s[4:5]
 ;
@@ -435,8 +431,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ; DAGISEL-GFX10-NEXT:    ;;#ASMEND
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, chain_preserve_callee_2 at abs32@hi
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s4, chain_preserve_callee_2 at abs32@lo
-; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; DAGISEL-GFX10-NEXT:    s_mov_b32 s0, s2
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 exec_lo, -1
 ; DAGISEL-GFX10-NEXT:    s_setpc_b64 s[4:5]
   %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index d7b469f60c6e11c..1fd0c67737a27af 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -27,13 +27,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
-  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 [[COPY7]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: chain_to_chain
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -55,13 +53,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
-  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 [[COPY8]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: chain_to_chain
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -85,8 +81,7 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], @callee, 0, killed [[S_MOV_B32_2]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: chain_to_chain
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -112,8 +107,7 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], @callee, 0, killed [[S_MOV_B32_2]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -137,13 +131,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
-  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 [[COPY7]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: cs_to_chain
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -165,13 +157,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
-  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 [[COPY8]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: cs_to_chain
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -195,8 +185,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], @callee, 0, killed [[S_MOV_B32_2]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: cs_to_chain
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -222,8 +211,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], @callee, 0, killed [[S_MOV_B32_2]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -247,13 +235,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
-  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 [[COPY7]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: chain_to_chain_preserve
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -275,13 +261,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
-  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 [[COPY8]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -305,8 +289,7 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], @callee_preserve, 0, killed [[S_MOV_B32_2]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: chain_to_chain_preserve
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -332,8 +315,7 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], @callee_preserve, 0, killed [[S_MOV_B32_2]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -357,13 +339,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
-  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 [[COPY7]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: cs_to_chain_preserve
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -385,13 +365,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
-  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 [[COPY8]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -415,8 +393,7 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], @callee_preserve, 0, killed [[S_MOV_B32_2]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: cs_to_chain_preserve
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -442,8 +419,7 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_2]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], @callee_preserve, 0, killed [[S_MOV_B32_2]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -470,9 +446,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY7]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY8]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 [[REG_SEQUENCE]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: indirect
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -497,9 +471,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY8]]
   ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 [[REG_SEQUENCE]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: indirect
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -523,8 +495,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], 0, 0, killed [[S_MOV_B32_]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: indirect
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -550,8 +521,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[S_MOV_B32_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], 0, 0, killed [[S_MOV_B32_]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -576,12 +546,11 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY5]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY7]]
-  ; GISEL-GFX11-NEXT:   $exec_lo = S_MOV_B32 [[COPY]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
   ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY8]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 [[COPY8]], @callee, 0, [[COPY]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: non_imm_exec
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -604,12 +573,11 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY7]]
   ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY8]]
-  ; GISEL-GFX10-NEXT:   $exec_lo = S_MOV_B32 [[COPY]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
   ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY9]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 [[COPY9]], @callee, 0, [[COPY]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: non_imm_exec
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -633,8 +601,7 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec_lo = COPY [[COPY7]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], @callee, 0, [[COPY7]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: non_imm_exec
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -660,8 +627,111 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec_lo = COPY [[COPY7]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], @callee, 0, [[COPY7]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
+
+define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: indirect_with_non_imm_exec
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX11-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY3]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY7]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY8]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY9]]
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 [[REG_SEQUENCE]], 0, 0, [[COPY2]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: indirect_with_non_imm_exec
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY3]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY8]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY9]]
+  ; GISEL-GFX10-NEXT:   [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 [[REG_SEQUENCE]], 0, 0, [[COPY2]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: indirect_with_non_imm_exec
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], 0, 0, [[COPY7]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: indirect_with_non_imm_exec
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
+  ; DAGISEL-GFX10-NEXT:   [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE]], 0, 0, [[COPY7]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index 93c66865014289c..fa447f981bf3a37 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -27,13 +27,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
   ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 [[COPY7]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: chain_to_chain
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -55,13 +53,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
   ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 [[COPY8]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: chain_to_chain
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -85,8 +81,7 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], @callee, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: chain_to_chain
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -112,8 +107,7 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], @callee, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -137,13 +131,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
   ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 [[COPY7]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: cs_to_chain
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -165,13 +157,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
   ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 [[COPY8]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: cs_to_chain
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -195,8 +185,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], @callee, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: cs_to_chain
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -222,8 +211,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], @callee, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -247,13 +235,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
   ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 [[COPY7]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: chain_to_chain_preserve
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -275,13 +261,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
   ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 [[COPY8]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -305,8 +289,7 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], @callee_preserve, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: chain_to_chain_preserve
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -332,8 +315,7 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], @callee_preserve, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -357,13 +339,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY4]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY5]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY6]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
   ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY7]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 [[COPY7]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: cs_to_chain_preserve
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -385,13 +365,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY6]]
   ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
   ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY8]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 [[COPY8]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -415,8 +393,7 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], @callee_preserve, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: cs_to_chain_preserve
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -442,8 +419,7 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], @callee_preserve, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -470,9 +446,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY7]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY8]]
-  ; GISEL-GFX11-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 [[REG_SEQUENCE]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: indirect
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -497,9 +471,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY8]]
   ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
-  ; GISEL-GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 [[REG_SEQUENCE]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: indirect
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -523,8 +495,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], 0, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: indirect
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -550,8 +521,7 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr,
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[S_MOV_B64_]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE]], 0, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
@@ -578,12 +548,11 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
   ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY6]]
   ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY7]]
   ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY8]]
-  ; GISEL-GFX11-NEXT:   $exec = S_MOV_B64 [[REG_SEQUENCE]]
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
   ; GISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
   ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX11-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE1]]
-  ; GISEL-GFX11-NEXT:   SI_TCRETURN [[COPY9]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 [[COPY9]], @callee, 0, [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; GISEL-GFX10-LABEL: name: non_imm_exec
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -608,12 +577,11 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
   ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY8]]
   ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]]
-  ; GISEL-GFX10-NEXT:   $exec = S_MOV_B64 [[REG_SEQUENCE]]
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
   ; GISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
   ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
   ; GISEL-GFX10-NEXT:   [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE1]]
-  ; GISEL-GFX10-NEXT:   SI_TCRETURN [[COPY10]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 [[COPY10]], @callee, 0, [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
   ;
   ; DAGISEL-GFX11-LABEL: name: non_imm_exec
   ; DAGISEL-GFX11: bb.0 (%ir-block.0):
@@ -639,8 +607,7 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX11-NEXT:   $exec = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX11-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE1]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE1]], @callee, 0, killed [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   ;
   ; DAGISEL-GFX10-LABEL: name: non_imm_exec
   ; DAGISEL-GFX10: bb.0 (%ir-block.0):
@@ -668,8 +635,119 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr
   ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
   ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
   ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
-  ; DAGISEL-GFX10-NEXT:   $exec = COPY [[REG_SEQUENCE]]
-  ; DAGISEL-GFX10-NEXT:   SI_TCRETURN killed [[REG_SEQUENCE1]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE1]], @callee, 0, killed [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
   call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i64 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
   unreachable
 }
+
+define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+  ; GISEL-GFX11-LABEL: name: indirect_with_non_imm_exec
+  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX11-NEXT: {{  $}}
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX11-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX11-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX11-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY4]]
+  ; GISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; GISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY6]]
+  ; GISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY7]]
+  ; GISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY8]]
+  ; GISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY9]]
+  ; GISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY10]]
+  ; GISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 [[REG_SEQUENCE]], 0, 0, [[REG_SEQUENCE1]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; GISEL-GFX10-LABEL: name: indirect_with_non_imm_exec
+  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; GISEL-GFX10-NEXT: {{  $}}
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GISEL-GFX10-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; GISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; GISEL-GFX10-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; GISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY4]]
+  ; GISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; GISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY6]]
+  ; GISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY7]]
+  ; GISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY8]]
+  ; GISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY9]]
+  ; GISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY10]]
+  ; GISEL-GFX10-NEXT:   [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; GISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY11]]
+  ; GISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 [[REG_SEQUENCE]], 0, 0, [[REG_SEQUENCE1]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+  ;
+  ; DAGISEL-GFX11-LABEL: name: indirect_with_non_imm_exec
+  ; DAGISEL-GFX11: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX11-NEXT: {{  $}}
+  ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; DAGISEL-GFX11-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX11-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX11-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX11-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX11-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX11-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; DAGISEL-GFX11-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1
+  ; DAGISEL-GFX11-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX11-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX11-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX11-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE1]], 0, 0, killed [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  ;
+  ; DAGISEL-GFX10-LABEL: name: indirect_with_non_imm_exec
+  ; DAGISEL-GFX10: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+  ; DAGISEL-GFX10-NEXT: {{  $}}
+  ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11
+  ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10
+  ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; DAGISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX10-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; DAGISEL-GFX10-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; DAGISEL-GFX10-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; DAGISEL-GFX10-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; DAGISEL-GFX10-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; DAGISEL-GFX10-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; DAGISEL-GFX10-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; DAGISEL-GFX10-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1
+  ; DAGISEL-GFX10-NEXT:   [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
+  ; DAGISEL-GFX10-NEXT:   $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY11]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr0 = COPY [[COPY6]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr1 = COPY [[COPY5]]
+  ; DAGISEL-GFX10-NEXT:   $sgpr2 = COPY [[COPY4]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr8 = COPY [[COPY3]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr9 = COPY [[COPY2]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr10 = COPY [[COPY1]]
+  ; DAGISEL-GFX10-NEXT:   $vgpr11 = COPY [[COPY]]
+  ; DAGISEL-GFX10-NEXT:   SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE1]], 0, 0, killed [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
+  call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i64 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
+  unreachable
+}



More information about the llvm-commits mailing list