[llvm] 09b5329 - Revert "[AMDGPU] Move call clobbered return address registers s[30:31] to callee saved range"

Ron Lieberman via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 22 08:39:57 PST 2021


Author: Ron Lieberman
Date: 2021-12-22T11:39:28-05:00
New Revision: 09b53296cf1649d6f953e71d1c3cd970ad74dde8

URL: https://github.com/llvm/llvm-project/commit/09b53296cf1649d6f953e71d1c3cd970ad74dde8
DIFF: https://github.com/llvm/llvm-project/commit/09b53296cf1649d6f953e71d1c3cd970ad74dde8.diff

LOG: Revert "[AMDGPU] Move call clobbered return address registers s[30:31] to callee saved range"

This reverts commit 9075009d1fd5f2bf9aa6c2f362d2993691a316b3.

 Failed the AMDGPU runtime buildbot (build #3514).

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
    llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.td
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-illegal-types.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fixed-function-abi-vgpr-args.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
    llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
    llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
    llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
    llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
    llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
    llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll
    llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
    llvm/test/CodeGen/AMDGPU/fpow.ll
    llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll
    llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
    llvm/test/CodeGen/AMDGPU/ipra.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
    llvm/test/CodeGen/AMDGPU/llvm.powi.ll
    llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
    llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/nested-calls.ll
    llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir
    llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
    llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
    llvm/test/CodeGen/AMDGPU/save-fp.ll
    llvm/test/CodeGen/AMDGPU/sibling-call.ll
    llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
    llvm/test/CodeGen/AMDGPU/stack-realign.ll
    llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
    llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
    llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
    llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
    llvm/test/CodeGen/AMDGPU/udiv.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
    llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
    llvm/test/CodeGen/AMDGPU/wave32.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-metadata.mir
    llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 8041e2993c118..2f1e7823f65cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -337,6 +337,7 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
                                      FunctionLoweringInfo &FLI) const {
 
   MachineFunction &MF = B.getMF();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MFI->setIfReturnsVoid(!Val);
 
@@ -352,15 +353,40 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
     return true;
   }
 
-  unsigned ReturnOpc =
-      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
+  auto const &ST = MF.getSubtarget<GCNSubtarget>();
+
+  unsigned ReturnOpc = 0;
+  if (IsShader)
+    ReturnOpc = AMDGPU::SI_RETURN_TO_EPILOG;
+  else if (CC == CallingConv::AMDGPU_Gfx)
+    ReturnOpc = AMDGPU::S_SETPC_B64_return_gfx;
+  else
+    ReturnOpc = AMDGPU::S_SETPC_B64_return;
+
   auto Ret = B.buildInstrNoInsert(ReturnOpc);
+  Register ReturnAddrVReg;
+  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
+    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
+    Ret.addUse(ReturnAddrVReg);
+  } else if (ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
+    ReturnAddrVReg =
+        MRI.createVirtualRegister(&AMDGPU::Gfx_CCR_SGPR_64RegClass);
+    Ret.addUse(ReturnAddrVReg);
+  }
 
   if (!FLI.CanLowerReturn)
     insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
   else if (!lowerReturnVal(B, Val, VRegs, Ret))
     return false;
 
+  if (ReturnOpc == AMDGPU::S_SETPC_B64_return ||
+      ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
+                                         &AMDGPU::SGPR_64RegClass);
+    B.buildCopy(ReturnAddrVReg, LiveInReturn);
+  }
+
   // TODO: Handle CalleeSavedRegsViaCopy.
 
   B.insertInstr(Ret);
@@ -575,6 +601,14 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
 
+  if (!IsEntryFunc) {
+    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
+    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
+                                         &AMDGPU::SGPR_64RegClass);
+    MBB.addLiveIn(ReturnAddrReg);
+    B.buildCopy(LiveInReturn, ReturnAddrReg);
+  }
+
   if (Info->hasImplicitBufferPtr()) {
     Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

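Note: the net effect of restoring this lowering shows up in the MIR tests
updated below. The incoming return address in $sgpr30_sgpr31 is copied into
an sgpr_64 virtual register on entry, and the return selects to
S_SETPC_B64_return with an explicit ccr_sgpr_64 operand rather than the
SI_RETURN pseudo. Excerpted from the test changes in this patch:

    %3:sgpr_64 = COPY $sgpr30_sgpr31
    ...
    %6:ccr_sgpr_64 = COPY %3
    S_SETPC_B64_return %6, implicit $vgpr0
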
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index dec227777d8f5..1682d43ae671a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -152,10 +152,6 @@ def CSR_AMDGPU_AGPRs_32_255 : CalleeSavedRegs<
   (sequence "AGPR%u", 32, 255)
 >;
 
-def CSR_AMDGPU_SGPRs_30_31 : CalleeSavedRegs<
-  (sequence "SGPR%u", 30, 31)
->;
-
 def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
   (sequence "SGPR%u", 32, 105)
 >;
@@ -186,7 +182,7 @@ def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
 >;
 
 def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
-  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_30_31, CSR_AMDGPU_SGPRs_32_105)
+  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
 >;
 
 def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs<
@@ -194,7 +190,7 @@ def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs<
 >;
 
 def CSR_AMDGPU_SI_Gfx : CalleeSavedRegs<
-  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs_4_29, CSR_AMDGPU_SGPRs_30_31, CSR_AMDGPU_SI_Gfx_SGPRs_64_105)
+  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs_4_29, CSR_AMDGPU_SI_Gfx_SGPRs_64_105)
 >;
 
 def CSR_AMDGPU_SI_Gfx_With_AGPRs : CalleeSavedRegs<

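Note: with s[30:31] dropped from these lists, the non-entry CSR sets are
back to

    CSR_AMDGPU_HighRegs = CSR_AMDGPU_VGPRs + s32..s105
    CSR_AMDGPU_SI_Gfx   = CSR_AMDGPU_VGPRs + s4..s29 + s64..s105

so s[30:31] is call-clobbered again, and the return address is kept alive
across the function body by the explicit copies restored in the lowering
code above rather than by callee-saved spill/restore.
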
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d662bb1d9af9f..54177564afbc7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4398,6 +4398,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(TC_RETURN)
   NODE_NAME_CASE(TRAP)
   NODE_NAME_CASE(RET_FLAG)
+  NODE_NAME_CASE(RET_GFX_FLAG)
   NODE_NAME_CASE(RETURN_TO_EPILOG)
   NODE_NAME_CASE(ENDPGM)
   NODE_NAME_CASE(DWORDADDR)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 3dee2922501de..daaca8737c5de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -367,6 +367,9 @@ enum NodeType : unsigned {
   // Return with values from a non-entry function.
   RET_FLAG,
 
+  // Return with values from a non-entry function (AMDGPU_Gfx CC).
+  RET_GFX_FLAG,
+
   DWORDADDR,
   FRACT,
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 23b8fcf75f168..391dc84285392 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -355,7 +355,11 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
 def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
 
-def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
+def AMDGPUret_gfx_flag : SDNode<"AMDGPUISD::RET_GFX_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
   [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
 >;
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index d05536ca3894a..3fad7e1921951 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -120,7 +120,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
   // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
   // need to select it to the subtarget specific version, and there's no way to
   // do that with a single pseudo source operation.
-  if (Opcode == AMDGPU::S_SETPC_B64_return)
+  if (Opcode == AMDGPU::S_SETPC_B64_return ||
+      Opcode == AMDGPU::S_SETPC_B64_return_gfx)
     Opcode = AMDGPU::S_SETPC_B64;
   else if (Opcode == AMDGPU::SI_CALL) {
     // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 77937c48a9f1b..9f138136e6e91 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2618,6 +2618,24 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   SmallVector<SDValue, 48> RetOps;
   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
 
+  // Add return address for callable functions.
+  if (!Info->isEntryFunction()) {
+    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+    SDValue ReturnAddrReg = CreateLiveInRegister(
+      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+    SDValue ReturnAddrVirtualReg =
+        DAG.getRegister(MF.getRegInfo().createVirtualRegister(
+                            CallConv != CallingConv::AMDGPU_Gfx
+                                ? &AMDGPU::CCR_SGPR_64RegClass
+                                : &AMDGPU::Gfx_CCR_SGPR_64RegClass),
+                        MVT::i64);
+    Chain =
+        DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(ReturnAddrVirtualReg);
+  }
+
   // Copy the result values into the output registers.
   for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
        ++I, ++RealRVLocIdx) {
@@ -2674,8 +2692,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     RetOps.push_back(Flag);
 
   unsigned Opc = AMDGPUISD::ENDPGM;
-  if (!IsWaveEnd)
-    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
+  if (!IsWaveEnd) {
+    if (IsShader)
+      Opc = AMDGPUISD::RETURN_TO_EPILOG;
+    else if (CallConv == CallingConv::AMDGPU_Gfx)
+      Opc = AMDGPUISD::RET_GFX_FLAG;
+    else
+      Opc = AMDGPUISD::RET_FLAG;
+  }
+
   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
 
@@ -3243,6 +3268,21 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
 
+  SDValue PhysReturnAddrReg;
+  if (IsTailCall) {
+    // Since the return is being combined with the call, we need to pass on the
+    // return address.
+
+    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+    SDValue ReturnAddrReg = CreateLiveInRegister(
+      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+                                        MVT::i64);
+    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
   // We don't usually want to end the call-sequence here because we would tidy
   // the frame up *after* the call, however in the ABI-changing tail-call case
   // we've carefully laid out the parameters so that when sp is reset they'll be
@@ -3272,6 +3312,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     // this information must travel along with the operation for eventual
     // consumption by emitEpilogue.
     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+
+    Ops.push_back(PhysReturnAddrReg);
   }
 
   // Add argument registers to the end of the list so that they are known live

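Note: for a tail call the return is folded into the call, so the restored
code re-materializes the live-in return address into the physical
return-address register and attaches it to the tail-call node. Schematically
(a sketch of the resulting MIR, not copied from a specific test, with %ra
holding the caller's incoming return address):

    $sgpr30_sgpr31 = COPY %ra
    SI_TC_RETURN @callee, ..., $sgpr30_sgpr31
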
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b1771e1ff0840..6fbe5d45ce0ae 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -955,8 +955,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
   // NOTE: this could be improved with knowledge of all call sites or
   //   with knowledge of the called routines.
   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
-      MI.getOpcode() == AMDGPU::SI_RETURN ||
       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+      MI.getOpcode() == AMDGPU::S_SETPC_B64_return_gfx ||
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
   }

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 92f6456f70e44..4ce2c8a02194d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2071,23 +2071,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
-  case AMDGPU::SI_RETURN: {
-    const MachineFunction *MF = MBB.getParent();
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    const SIRegisterInfo *TRI = ST.getRegisterInfo();
-    // Hiding the return address use with SI_RETURN may lead to extra kills in
-    // the function and missing live-ins. We are fine in practice because callee
-    // saved register handling ensures the register value is restored before
-    // RET, but we need the undef flag here to appease the MachineVerifier
-    // liveness checks.
-    MachineInstrBuilder MIB =
-        BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
-            .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
-
-    MIB.copyImplicitOps(MI);
-    MI.eraseFromParent();
-    break;
-  }
   }
   return true;
 }

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 929d7526d49f6..2bbce1d8ba939 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -463,7 +463,7 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI <
 
 // Return for returning function calls.
 def SI_RETURN : SPseudoInstSI <
-  (outs), (ins), [(AMDGPUret_flag)],
+  (outs), (ins), [],
   "; return"> {
   let isTerminator = 1;
   let isBarrier = 1;

diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index ccbdefe034d6a..55196fe334e6a 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -79,8 +79,6 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIRegisterInfo *RI = ST.getRegisterInfo();
 
   MachineBasicBlock::iterator I = SaveBlock.begin();
   if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
@@ -91,8 +89,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
       MCRegister Reg = CS.getReg();
 
       MachineInstrSpan MIS(I, &SaveBlock);
-      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(
-          Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32);
+      const TargetRegisterClass *RC =
+        TRI->getMinimalPhysRegClass(Reg, MVT::i32);
 
       // If this value was already livein, we probably have a direct use of the
       // incoming register value, so don't kill at the spill point. This happens
@@ -121,8 +119,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIRegisterInfo *RI = ST.getRegisterInfo();
+
   // Restore all registers immediately before the return and any
   // terminators that precede it.
   MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator();
@@ -131,8 +128,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
   if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
     for (const CalleeSavedInfo &CI : reverse(CSI)) {
       unsigned Reg = CI.getReg();
-      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(
-          Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32);
+      const TargetRegisterClass *RC =
+        TRI->getMinimalPhysRegClass(Reg, MVT::i32);
 
       TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI);
       assert(I != RestoreBlock.begin() &&

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 110cfeb09eb51..340e2b48e5cd9 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -701,6 +701,23 @@ def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16],
   let HasSGPR = 1;
 }
 
+// CCR (call clobbered registers) SGPR 64-bit registers
+def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
+                                (add (trunc SGPR_64, 16))> {
+  let CopyCost = SGPR_64.CopyCost;
+  let AllocationPriority = SGPR_64.AllocationPriority;
+  let HasSGPR = 1;
+}
+
+// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC
+def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
+                                (add (trunc (shl SGPR_64, 15), 1), // s[30:31]
+                                     (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63]
+  let CopyCost = SGPR_64.CopyCost;
+  let AllocationPriority = SGPR_64.AllocationPriority;
+  let HasSGPR = 1;
+}
+
 def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
                             (add TTMP_64Regs)> {
   let isAllocatable = 0;

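Note: the set arithmetic here operates on SGPR_64's list of aligned pairs
s[0:1], s[2:3], ...; (shl L, N) drops the first N pairs and (trunc L, N)
keeps the first N. The two classes therefore expand to

    CCR_SGPR_64     = (trunc SGPR_64, 16)           = s[0:1] .. s[30:31]
    Gfx_CCR_SGPR_64 = (trunc (shl SGPR_64, 15), 1)  = s[30:31]
                    + (trunc (shl SGPR_64, 18), 14) = s[36:37] .. s[62:63]

which matches the call-clobbered ranges named in the comments.
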
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 0ca679cd506cc..1713586dcf5b7 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -152,8 +152,8 @@ class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
 }
 
 // 64-bit input, no output
-class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
-  opName, (outs), (ins SReg_64:$src0), "$src0", pattern> {
+class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : SOP1_Pseudo <
+  opName, (outs), (ins rc:$src0), "$src0", pattern> {
   let has_sdst = 0;
 }
 
@@ -300,7 +300,8 @@ def S_SETPC_B64 : SOP1_1  <"s_setpc_b64">;
 
 let isReturn = 1 in {
 // Define variant marked as return rather than branch.
-def S_SETPC_B64_return : SOP1_1<"">;
+def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>;
+def S_SETPC_B64_return_gfx : SOP1_1<"", Gfx_CCR_SGPR_64, [(AMDGPUret_gfx_flag i64:$src0)]>;
 }
 } // End isTerminator = 1, isBarrier = 1
 

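Note: with the selection patterns moved back onto the *_return variants,
AMDGPUret_flag and AMDGPUret_gfx_flag select directly to S_SETPC_B64_return
and S_SETPC_B64_return_gfx carrying the return address, and
AMDGPUMCInstLower (above) rewrites the opcode to plain S_SETPC_B64 at
emission. Once the ccr_sgpr_64 operand is allocated back to s[30:31], the
final assembly is the usual

    s_setpc_b64 s[30:31]
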
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
index 0c48b0e3585d5..3c6c6523e1002 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
@@ -12,157 +12,193 @@
 name:            test_f32_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_f32_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
     ; GFX9: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX9-CONTRACT: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-DENORM-LABEL: name: test_f32_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
     ; GFX9-DENORM: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX9-UNSAFE: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-LABEL: name: test_f32_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
     ; GFX10: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX10-CONTRACT: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-DENORM-LABEL: name: test_f32_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
     ; GFX10-DENORM: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX10-UNSAFE: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = COPY $vgpr2
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %4:_(s32) = G_FMUL %0, %1
     %5:_(s32) = G_FADD %4, %2
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
 name:            test_f32_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_f32_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
     ; GFX9: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX9-CONTRACT: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
     ; GFX9-DENORM: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX9-UNSAFE: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-LABEL: name: test_f32_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
     ; GFX10: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX10-CONTRACT: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
     ; GFX10-DENORM: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX10-UNSAFE: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = COPY $vgpr2
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %4:_(s32) = G_FMUL %0, %1
     %5:_(s32) = G_FADD %2, %4
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
 name:            test_half_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_half_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -171,11 +207,13 @@ body:             |
     ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[FMUL]], [[TRUNC2]]
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-CONTRACT-LABEL: name: test_half_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -183,10 +221,12 @@ body:             |
     ; GFX9-CONTRACT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX9-CONTRACT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-DENORM-LABEL: name: test_half_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -194,11 +234,13 @@ body:             |
     ; GFX9-DENORM: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-DENORM: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[FMUL]], [[TRUNC2]]
     ; GFX9-DENORM: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX9-DENORM: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-UNSAFE-LABEL: name: test_half_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -206,10 +248,12 @@ body:             |
     ; GFX9-UNSAFE: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-LABEL: name: test_half_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -217,11 +261,13 @@ body:             |
     ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[FMUL]], [[TRUNC2]]
     ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-CONTRACT-LABEL: name: test_half_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -229,10 +275,12 @@ body:             |
     ; GFX10-CONTRACT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX10-CONTRACT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-DENORM-LABEL: name: test_half_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -240,11 +288,13 @@ body:             |
     ; GFX10-DENORM: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-DENORM: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[FMUL]], [[TRUNC2]]
     ; GFX10-DENORM: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX10-DENORM: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-UNSAFE-LABEL: name: test_half_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -252,28 +302,32 @@ body:             |
     ; GFX10-UNSAFE: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX10-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %4:_(s32) = COPY $vgpr0
     %0:_(s16) = G_TRUNC %4(s32)
     %5:_(s32) = COPY $vgpr1
     %1:_(s16) = G_TRUNC %5(s32)
     %6:_(s32) = COPY $vgpr2
     %2:_(s16) = G_TRUNC %6(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %7:_(s16) = G_FMUL %0, %1
     %8:_(s16) = G_FADD %7, %2
     %10:_(s32) = G_ANYEXT %8(s16)
     $vgpr0 = COPY %10(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %9:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %9, implicit $vgpr0
 ...
 
 ---
 name:            test_half_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_half_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -282,11 +336,13 @@ body:             |
     ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[TRUNC2]], [[FMUL]]
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-CONTRACT-LABEL: name: test_half_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -294,10 +350,12 @@ body:             |
     ; GFX9-CONTRACT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX9-CONTRACT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -305,11 +363,13 @@ body:             |
     ; GFX9-DENORM: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-DENORM: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[TRUNC2]], [[FMUL]]
     ; GFX9-DENORM: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX9-DENORM: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-UNSAFE-LABEL: name: test_half_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -317,10 +377,12 @@ body:             |
     ; GFX9-UNSAFE: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-LABEL: name: test_half_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -328,11 +390,13 @@ body:             |
     ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[TRUNC2]], [[FMUL]]
     ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-CONTRACT-LABEL: name: test_half_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -340,10 +404,12 @@ body:             |
     ; GFX10-CONTRACT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX10-CONTRACT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -351,11 +417,13 @@ body:             |
     ; GFX10-DENORM: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-DENORM: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[TRUNC2]], [[FMUL]]
     ; GFX10-DENORM: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX10-DENORM: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-UNSAFE-LABEL: name: test_half_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -363,28 +431,32 @@ body:             |
     ; GFX10-UNSAFE: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX10-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %4:_(s32) = COPY $vgpr0
     %0:_(s16) = G_TRUNC %4(s32)
     %5:_(s32) = COPY $vgpr1
     %1:_(s16) = G_TRUNC %5(s32)
     %6:_(s32) = COPY $vgpr2
     %2:_(s16) = G_TRUNC %6(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %7:_(s16) = G_FMUL %0, %1
     %8:_(s16) = G_FADD %2, %7
     %10:_(s32) = G_ANYEXT %8(s16)
     $vgpr0 = COPY %10(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %9:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %9, implicit $vgpr0
 ...
 
 ---
 name:            test_double_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_double_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -396,12 +468,14 @@ body:             |
     ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[MV]], [[MV1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FMUL]], [[MV2]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX9: $vgpr0 = COPY [[UV]](s32)
     ; GFX9: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-CONTRACT-LABEL: name: test_double_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -412,11 +486,13 @@ body:             |
     ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-CONTRACT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-CONTRACT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-DENORM-LABEL: name: test_double_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -427,12 +503,14 @@ body:             |
     ; GFX9-DENORM: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-DENORM: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-DENORM: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[MV]], [[MV1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FMUL]], [[MV2]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX9-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-UNSAFE-LABEL: name: test_double_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -443,11 +521,13 @@ body:             |
     ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-UNSAFE: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-UNSAFE: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-LABEL: name: test_double_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -458,12 +538,14 @@ body:             |
     ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[MV]], [[MV1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FMUL]], [[MV2]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX10: $vgpr0 = COPY [[UV]](s32)
     ; GFX10: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-CONTRACT-LABEL: name: test_double_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -474,11 +556,13 @@ body:             |
     ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-CONTRACT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-CONTRACT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-DENORM-LABEL: name: test_double_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -489,12 +573,14 @@ body:             |
     ; GFX10-DENORM: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-DENORM: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-DENORM: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[MV]], [[MV1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FMUL]], [[MV2]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX10-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-UNSAFE-LABEL: name: test_double_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -505,11 +591,13 @@ body:             |
     ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-UNSAFE: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-UNSAFE: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -519,19 +607,21 @@ body:             |
     %8:_(s32) = COPY $vgpr4
     %9:_(s32) = COPY $vgpr5
     %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %10:_(s64) = G_FMUL %0, %1
     %11:_(s64) = G_FADD %10, %2
     %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
     $vgpr0 = COPY %13(s32)
     $vgpr1 = COPY %14(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %12:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %12, implicit $vgpr0, implicit $vgpr1
 ...
 
 ---
 name:            test_double_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_double_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -543,12 +633,14 @@ body:             |
     ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[MV]], [[MV1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[MV2]], [[FMUL]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX9: $vgpr0 = COPY [[UV]](s32)
     ; GFX9: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-CONTRACT-LABEL: name: test_double_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -559,11 +651,13 @@ body:             |
     ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-CONTRACT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-CONTRACT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -574,12 +668,14 @@ body:             |
     ; GFX9-DENORM: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-DENORM: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-DENORM: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[MV]], [[MV1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[MV2]], [[FMUL]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX9-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-UNSAFE-LABEL: name: test_double_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -590,11 +686,13 @@ body:             |
     ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-UNSAFE: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-UNSAFE: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-LABEL: name: test_double_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -605,12 +703,14 @@ body:             |
     ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[MV]], [[MV1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[MV2]], [[FMUL]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX10: $vgpr0 = COPY [[UV]](s32)
     ; GFX10: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-CONTRACT-LABEL: name: test_double_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -621,11 +721,13 @@ body:             |
     ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-CONTRACT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-CONTRACT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -636,12 +738,14 @@ body:             |
     ; GFX10-DENORM: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-DENORM: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-DENORM: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[MV]], [[MV1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[MV2]], [[FMUL]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX10-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-UNSAFE-LABEL: name: test_double_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -652,11 +756,13 @@ body:             |
     ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-UNSAFE: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-UNSAFE: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -666,19 +772,21 @@ body:             |
     %8:_(s32) = COPY $vgpr4
     %9:_(s32) = COPY $vgpr5
     %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %10:_(s64) = G_FMUL %0, %1
     %11:_(s64) = G_FADD %2, %10
     %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
     $vgpr0 = COPY %13(s32)
     $vgpr1 = COPY %14(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %12:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %12, implicit $vgpr0, implicit $vgpr1
 ...
 
 ---
 name:            test_4xfloat_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_4xfloat_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -696,6 +804,7 @@ body:             |
     ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -703,7 +812,8 @@ body:             |
     ; GFX9: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9: $vgpr2 = COPY [[UV2]](s32)
     ; GFX9: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX9: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX9: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -720,6 +830,7 @@ body:             |
     ; GFX9-CONTRACT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX9-CONTRACT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX9-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9-CONTRACT: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-CONTRACT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -727,7 +838,8 @@ body:             |
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-CONTRACT: $vgpr2 = COPY [[UV2]](s32)
     ; GFX9-CONTRACT: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX9-CONTRACT: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -744,6 +856,7 @@ body:             |
     ; GFX9-DENORM: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX9-DENORM: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX9-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9-DENORM: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -751,7 +864,8 @@ body:             |
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-DENORM: $vgpr2 = COPY [[UV2]](s32)
     ; GFX9-DENORM: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX9-DENORM: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -768,6 +882,7 @@ body:             |
     ; GFX9-UNSAFE: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX9-UNSAFE: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX9-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9-UNSAFE: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-UNSAFE: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -775,7 +890,8 @@ body:             |
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-UNSAFE: $vgpr2 = COPY [[UV2]](s32)
     ; GFX9-UNSAFE: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX9-UNSAFE: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX10-LABEL: name: test_4xfloat_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -792,6 +908,7 @@ body:             |
     ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX10: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -799,7 +916,8 @@ body:             |
     ; GFX10: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10: $vgpr2 = COPY [[UV2]](s32)
     ; GFX10: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX10: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX10: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -816,6 +934,7 @@ body:             |
     ; GFX10-CONTRACT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX10-CONTRACT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX10-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX10-CONTRACT: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-CONTRACT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -823,7 +942,8 @@ body:             |
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-CONTRACT: $vgpr2 = COPY [[UV2]](s32)
     ; GFX10-CONTRACT: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX10-CONTRACT: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -840,6 +960,7 @@ body:             |
     ; GFX10-DENORM: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX10-DENORM: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX10-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX10-DENORM: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -847,7 +968,8 @@ body:             |
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-DENORM: $vgpr2 = COPY [[UV2]](s32)
     ; GFX10-DENORM: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX10-DENORM: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -864,6 +986,7 @@ body:             |
     ; GFX10-UNSAFE: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX10-UNSAFE: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX10-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX10-UNSAFE: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-UNSAFE: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -871,7 +994,8 @@ body:             |
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-UNSAFE: $vgpr2 = COPY [[UV2]](s32)
     ; GFX10-UNSAFE: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX10-UNSAFE: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %6:_(s32) = COPY $vgpr2
@@ -887,6 +1011,7 @@ body:             |
     %14:_(s32) = COPY $vgpr10
     %15:_(s32) = COPY $vgpr11
     %2:_(<4 x s32>) = G_BUILD_VECTOR %12(s32), %13(s32), %14(s32), %15(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %16:_(<4 x s32>) = G_FMUL %0, %1
     %17:_(<4 x s32>) = G_FADD %16, %2
     %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32) = G_UNMERGE_VALUES %17(<4 x s32>)
@@ -894,14 +1019,15 @@ body:             |
     $vgpr1 = COPY %20(s32)
     $vgpr2 = COPY %21(s32)
     $vgpr3 = COPY %22(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    %18:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %18, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 ...
 
 ---
 name:            test_3xfloat_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -916,13 +1042,15 @@ body:             |
     ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX9: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX9: $vgpr0 = COPY [[UV]](s32)
     ; GFX9: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX9: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX9: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -936,13 +1064,15 @@ body:             |
     ; GFX9-CONTRACT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX9-CONTRACT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX9-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX9-CONTRACT: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-CONTRACT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-CONTRACT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX9-CONTRACT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -956,13 +1086,15 @@ body:             |
     ; GFX9-DENORM: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX9-DENORM: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX9-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX9-DENORM: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX9-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-DENORM: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX9-DENORM: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -976,13 +1108,15 @@ body:             |
     ; GFX9-UNSAFE: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX9-UNSAFE: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX9-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX9-UNSAFE: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-UNSAFE: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-UNSAFE: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX9-UNSAFE: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -996,13 +1130,15 @@ body:             |
     ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX10: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX10: $vgpr0 = COPY [[UV]](s32)
     ; GFX10: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX10: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX10: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1016,13 +1152,15 @@ body:             |
     ; GFX10-CONTRACT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX10-CONTRACT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX10-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX10-CONTRACT: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-CONTRACT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-CONTRACT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX10-CONTRACT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1036,13 +1174,15 @@ body:             |
     ; GFX10-DENORM: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX10-DENORM: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX10-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX10-DENORM: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX10-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-DENORM: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX10-DENORM: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1056,13 +1196,15 @@ body:             |
     ; GFX10-UNSAFE: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX10-UNSAFE: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX10-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX10-UNSAFE: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-UNSAFE: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-UNSAFE: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX10-UNSAFE: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %6:_(s32) = COPY $vgpr2
@@ -1075,20 +1217,22 @@ body:             |
     %11:_(s32) = COPY $vgpr7
     %12:_(s32) = COPY $vgpr8
     %2:_(<3 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32), %12(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %13:_(<3 x s32>) = G_FMUL %0, %1
     %14:_(<3 x s32>) = G_FADD %2, %13
     %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(<3 x s32>)
     $vgpr0 = COPY %16(s32)
     $vgpr1 = COPY %17(s32)
     $vgpr2 = COPY %18(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    %15:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %15, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
 ...
 
 ---
 name:            test_4xhalf_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_4xhalf_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
@@ -1100,12 +1244,14 @@ body:             |
     ; GFX9: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX9: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX9: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX9: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-CONTRACT-LABEL: name: test_4xhalf_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1116,12 +1262,14 @@ body:             |
     ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX9-CONTRACT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX9-CONTRACT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX9-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX9-CONTRACT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1132,12 +1280,14 @@ body:             |
     ; GFX9-DENORM: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX9-DENORM: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX9-DENORM: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX9-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX9-DENORM: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-UNSAFE-LABEL: name: test_4xhalf_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1148,12 +1298,14 @@ body:             |
     ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX9-UNSAFE: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX9-UNSAFE: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX9-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX9-UNSAFE: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-LABEL: name: test_4xhalf_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1164,12 +1316,14 @@ body:             |
     ; GFX10: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX10: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX10: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX10: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX10: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX10: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-CONTRACT-LABEL: name: test_4xhalf_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1180,12 +1334,14 @@ body:             |
     ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX10-CONTRACT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX10-CONTRACT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX10-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX10-CONTRACT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1196,12 +1352,14 @@ body:             |
     ; GFX10-DENORM: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX10-DENORM: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX10-DENORM: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX10-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX10-DENORM: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-UNSAFE-LABEL: name: test_4xhalf_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1212,12 +1370,14 @@ body:             |
     ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX10-UNSAFE: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX10-UNSAFE: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX10-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX10-UNSAFE: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     %4:_(<2 x s16>) = COPY $vgpr0
     %5:_(<2 x s16>) = COPY $vgpr1
     %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
@@ -1227,19 +1387,21 @@ body:             |
     %8:_(<2 x s16>) = COPY $vgpr4
     %9:_(<2 x s16>) = COPY $vgpr5
     %2:_(<4 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %10:_(<4 x s16>) = G_FMUL %0, %1
     %11:_(<4 x s16>) = G_FADD %10, %2
     %13:_(<2 x s16>), %14:_(<2 x s16>) = G_UNMERGE_VALUES %11(<4 x s16>)
     $vgpr0 = COPY %13(<2 x s16>)
     $vgpr1 = COPY %14(<2 x s16>)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %12:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %12, implicit $vgpr0, implicit $vgpr1
 ...
 
 ---
 name:            test_3xhalf_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
@@ -1261,6 +1423,7 @@ body:             |
     ; GFX9: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX9: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX9: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX9: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1268,7 +1431,8 @@ body:             |
     ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX9: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1289,6 +1453,7 @@ body:             |
     ; GFX9-CONTRACT: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX9-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX9-CONTRACT: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX9-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX9-CONTRACT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX9-CONTRACT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1296,7 +1461,8 @@ body:             |
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1317,6 +1483,7 @@ body:             |
     ; GFX9-DENORM: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX9-DENORM: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX9-DENORM: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX9-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX9-DENORM: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1324,7 +1491,8 @@ body:             |
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX9-DENORM: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1345,6 +1513,7 @@ body:             |
     ; GFX9-UNSAFE: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX9-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX9-UNSAFE: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX9-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX9-UNSAFE: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX9-UNSAFE: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1352,7 +1521,8 @@ body:             |
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1373,6 +1543,7 @@ body:             |
     ; GFX10: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX10: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX10: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX10: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX10: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1380,7 +1551,8 @@ body:             |
     ; GFX10: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX10: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1401,6 +1573,7 @@ body:             |
     ; GFX10-CONTRACT: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX10-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX10-CONTRACT: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX10-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX10-CONTRACT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX10-CONTRACT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1408,7 +1581,8 @@ body:             |
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1429,6 +1603,7 @@ body:             |
     ; GFX10-DENORM: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX10-DENORM: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX10-DENORM: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX10-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX10-DENORM: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1436,7 +1611,8 @@ body:             |
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX10-DENORM: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1457,6 +1633,7 @@ body:             |
     ; GFX10-UNSAFE: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX10-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX10-UNSAFE: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX10-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX10-UNSAFE: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX10-UNSAFE: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1464,7 +1641,8 @@ body:             |
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     %4:_(<2 x s16>) = COPY $vgpr0
     %5:_(<2 x s16>) = COPY $vgpr1
     %10:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -1478,6 +1656,7 @@ body:             |
     %9:_(<2 x s16>) = COPY $vgpr5
     %15:_(<6 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>), %10(<2 x s16>)
     %2:_(<3 x s16>), %16:_(<3 x s16>) = G_UNMERGE_VALUES %15(<6 x s16>)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %17:_(<3 x s16>) = G_FMUL %0, %1
     %18:_(<3 x s16>) = G_FADD %2, %17
     %22:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1485,14 +1664,15 @@ body:             |
     %20:_(<2 x s16>), %21:_(<2 x s16>), %24:_(<2 x s16>) = G_UNMERGE_VALUES %23(<6 x s16>)
     $vgpr0 = COPY %20(<2 x s16>)
     $vgpr1 = COPY %21(<2 x s16>)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %19:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %19, implicit $vgpr0, implicit $vgpr1
 ...
 
 ---
 name:            test_4xdouble_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_4xdouble_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -1534,6 +1714,7 @@ body:             |
     ; GFX9: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX9: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX9: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1545,7 +1726,8 @@ body:             |
     ; GFX9: $vgpr5 = COPY [[UV5]](s32)
     ; GFX9: $vgpr6 = COPY [[UV6]](s32)
     ; GFX9: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX9: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX9: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX9-CONTRACT-LABEL: name: test_4xdouble_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1586,6 +1768,7 @@ body:             |
     ; GFX9-CONTRACT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX9-CONTRACT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX9-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX9-CONTRACT: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-CONTRACT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1597,7 +1780,8 @@ body:             |
     ; GFX9-CONTRACT: $vgpr5 = COPY [[UV5]](s32)
     ; GFX9-CONTRACT: $vgpr6 = COPY [[UV6]](s32)
     ; GFX9-CONTRACT: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX9-CONTRACT: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1638,6 +1822,7 @@ body:             |
     ; GFX9-DENORM: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX9-DENORM: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX9-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX9-DENORM: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1649,7 +1834,8 @@ body:             |
     ; GFX9-DENORM: $vgpr5 = COPY [[UV5]](s32)
     ; GFX9-DENORM: $vgpr6 = COPY [[UV6]](s32)
     ; GFX9-DENORM: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX9-DENORM: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX9-UNSAFE-LABEL: name: test_4xdouble_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1690,6 +1876,7 @@ body:             |
     ; GFX9-UNSAFE: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX9-UNSAFE: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX9-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX9-UNSAFE: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-UNSAFE: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1701,7 +1888,8 @@ body:             |
     ; GFX9-UNSAFE: $vgpr5 = COPY [[UV5]](s32)
     ; GFX9-UNSAFE: $vgpr6 = COPY [[UV6]](s32)
     ; GFX9-UNSAFE: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX9-UNSAFE: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX10-LABEL: name: test_4xdouble_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1742,6 +1930,7 @@ body:             |
     ; GFX10: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX10: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX10: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1753,7 +1942,8 @@ body:             |
     ; GFX10: $vgpr5 = COPY [[UV5]](s32)
     ; GFX10: $vgpr6 = COPY [[UV6]](s32)
     ; GFX10: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX10: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX10: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX10-CONTRACT-LABEL: name: test_4xdouble_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1794,6 +1984,7 @@ body:             |
     ; GFX10-CONTRACT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX10-CONTRACT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX10-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX10-CONTRACT: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-CONTRACT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1805,7 +1996,8 @@ body:             |
     ; GFX10-CONTRACT: $vgpr5 = COPY [[UV5]](s32)
     ; GFX10-CONTRACT: $vgpr6 = COPY [[UV6]](s32)
     ; GFX10-CONTRACT: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX10-CONTRACT: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1846,6 +2038,7 @@ body:             |
     ; GFX10-DENORM: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX10-DENORM: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX10-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX10-DENORM: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1857,7 +2050,8 @@ body:             |
     ; GFX10-DENORM: $vgpr5 = COPY [[UV5]](s32)
     ; GFX10-DENORM: $vgpr6 = COPY [[UV6]](s32)
     ; GFX10-DENORM: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX10-DENORM: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX10-UNSAFE-LABEL: name: test_4xdouble_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1898,6 +2092,7 @@ body:             |
     ; GFX10-UNSAFE: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX10-UNSAFE: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX10-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX10-UNSAFE: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-UNSAFE: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1909,7 +2104,8 @@ body:             |
     ; GFX10-UNSAFE: $vgpr5 = COPY [[UV5]](s32)
     ; GFX10-UNSAFE: $vgpr6 = COPY [[UV6]](s32)
     ; GFX10-UNSAFE: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX10-UNSAFE: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %6:_(s32) = COPY $vgpr2
@@ -1949,6 +2145,7 @@ body:             |
     %38:_(s64) = G_MERGE_VALUES %24(s32), %25(s32)
     %39:_(s64) = G_MERGE_VALUES %26(s32), %27(s32)
     %2:_(<4 x s64>) = G_BUILD_VECTOR %36(s64), %37(s64), %38(s64), %39(s64)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %40:_(<4 x s64>) = G_FMUL %0, %1
     %41:_(<4 x s64>) = G_FADD %40, %2
     %43:_(s32), %44:_(s32), %45:_(s32), %46:_(s32), %47:_(s32), %48:_(s32), %49:_(s32), %50:_(s32) = G_UNMERGE_VALUES %41(<4 x s64>)
@@ -1960,14 +2157,15 @@ body:             |
     $vgpr5 = COPY %48(s32)
     $vgpr6 = COPY %49(s32)
     $vgpr7 = COPY %50(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    %42:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %42, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
 ...
 
 ---
 name:            test_3xdouble_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -2000,6 +2198,7 @@ body:             |
     ; GFX9: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX9: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX9: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2009,7 +2208,8 @@ body:             |
     ; GFX9: $vgpr3 = COPY [[UV3]](s32)
     ; GFX9: $vgpr4 = COPY [[UV4]](s32)
     ; GFX9: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX9: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX9: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX9-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2041,6 +2241,7 @@ body:             |
     ; GFX9-CONTRACT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX9-CONTRACT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX9-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX9-CONTRACT: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-CONTRACT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2050,7 +2251,8 @@ body:             |
     ; GFX9-CONTRACT: $vgpr3 = COPY [[UV3]](s32)
     ; GFX9-CONTRACT: $vgpr4 = COPY [[UV4]](s32)
     ; GFX9-CONTRACT: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX9-CONTRACT: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2082,6 +2284,7 @@ body:             |
     ; GFX9-DENORM: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX9-DENORM: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX9-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX9-DENORM: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2091,7 +2294,8 @@ body:             |
     ; GFX9-DENORM: $vgpr3 = COPY [[UV3]](s32)
     ; GFX9-DENORM: $vgpr4 = COPY [[UV4]](s32)
     ; GFX9-DENORM: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX9-DENORM: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX9-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2123,6 +2327,7 @@ body:             |
     ; GFX9-UNSAFE: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX9-UNSAFE: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX9-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX9-UNSAFE: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-UNSAFE: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2132,7 +2337,8 @@ body:             |
     ; GFX9-UNSAFE: $vgpr3 = COPY [[UV3]](s32)
     ; GFX9-UNSAFE: $vgpr4 = COPY [[UV4]](s32)
     ; GFX9-UNSAFE: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX9-UNSAFE: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2164,6 +2370,7 @@ body:             |
     ; GFX10: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX10: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX10: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2173,7 +2380,8 @@ body:             |
     ; GFX10: $vgpr3 = COPY [[UV3]](s32)
     ; GFX10: $vgpr4 = COPY [[UV4]](s32)
     ; GFX10: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX10: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX10: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX10-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2205,6 +2413,7 @@ body:             |
     ; GFX10-CONTRACT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX10-CONTRACT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX10-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX10-CONTRACT: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-CONTRACT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2214,7 +2423,8 @@ body:             |
     ; GFX10-CONTRACT: $vgpr3 = COPY [[UV3]](s32)
     ; GFX10-CONTRACT: $vgpr4 = COPY [[UV4]](s32)
     ; GFX10-CONTRACT: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX10-CONTRACT: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2246,6 +2456,7 @@ body:             |
     ; GFX10-DENORM: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX10-DENORM: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX10-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX10-DENORM: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2255,7 +2466,8 @@ body:             |
     ; GFX10-DENORM: $vgpr3 = COPY [[UV3]](s32)
     ; GFX10-DENORM: $vgpr4 = COPY [[UV4]](s32)
     ; GFX10-DENORM: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX10-DENORM: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX10-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2287,6 +2499,7 @@ body:             |
     ; GFX10-UNSAFE: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX10-UNSAFE: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX10-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX10-UNSAFE: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-UNSAFE: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2296,7 +2509,8 @@ body:             |
     ; GFX10-UNSAFE: $vgpr3 = COPY [[UV3]](s32)
     ; GFX10-UNSAFE: $vgpr4 = COPY [[UV4]](s32)
     ; GFX10-UNSAFE: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX10-UNSAFE: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %6:_(s32) = COPY $vgpr2
@@ -2327,6 +2541,7 @@ body:             |
     %29:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
     %30:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
     %2:_(<3 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %31:_(<3 x s64>) = G_FMUL %0, %1
     %32:_(<3 x s64>) = G_FADD %2, %31
     %34:_(s32), %35:_(s32), %36:_(s32), %37:_(s32), %38:_(s32), %39:_(s32) = G_UNMERGE_VALUES %32(<3 x s64>)
@@ -2336,5 +2551,6 @@ body:             |
     $vgpr3 = COPY %37(s32)
     $vgpr4 = COPY %38(s32)
     $vgpr5 = COPY %39(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    %33:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %33, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
 ...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
index d6ef87759e9d7..67d0fd31db6b4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
@@ -12,157 +12,193 @@
 name:            test_f32_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_f32_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = reassoc G_FMUL [[COPY]], [[COPY1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[FMUL]], [[COPY2]]
     ; GFX9: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX9-CONTRACT: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-DENORM-LABEL: name: test_f32_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s32) = reassoc G_FMUL [[COPY]], [[COPY1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[FMUL]], [[COPY2]]
     ; GFX9-DENORM: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX9-UNSAFE: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-LABEL: name: test_f32_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = reassoc G_FMUL [[COPY]], [[COPY1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[FMUL]], [[COPY2]]
     ; GFX10: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX10-CONTRACT: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-DENORM-LABEL: name: test_f32_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s32) = reassoc G_FMUL [[COPY]], [[COPY1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[FMUL]], [[COPY2]]
     ; GFX10-DENORM: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX10-UNSAFE: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = COPY $vgpr2
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %4:_(s32) = reassoc G_FMUL %0, %1
     %5:_(s32) = reassoc G_FADD %4, %2
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
 name:            test_f32_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_f32_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = reassoc G_FMUL [[COPY]], [[COPY1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY2]], [[FMUL]]
     ; GFX9: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX9-CONTRACT: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s32) = reassoc G_FMUL [[COPY]], [[COPY1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY2]], [[FMUL]]
     ; GFX9-DENORM: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX9-UNSAFE: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-LABEL: name: test_f32_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = reassoc G_FMUL [[COPY]], [[COPY1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY2]], [[FMUL]]
     ; GFX10: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX10-CONTRACT: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s32) = reassoc G_FMUL [[COPY]], [[COPY1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY2]], [[FMUL]]
     ; GFX10-DENORM: $vgpr0 = COPY [[FADD]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX10-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
     ; GFX10-UNSAFE: $vgpr0 = COPY [[FMA]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = COPY $vgpr2
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %4:_(s32) = reassoc G_FMUL %0, %1
     %5:_(s32) = reassoc G_FADD %2, %4
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
 name:            test_half_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_half_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -171,11 +207,13 @@ body:             |
     ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[FMUL]], [[TRUNC2]]
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-CONTRACT-LABEL: name: test_half_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -183,10 +221,12 @@ body:             |
     ; GFX9-CONTRACT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX9-CONTRACT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-DENORM-LABEL: name: test_half_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -194,11 +234,13 @@ body:             |
     ; GFX9-DENORM: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-DENORM: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[FMUL]], [[TRUNC2]]
     ; GFX9-DENORM: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX9-DENORM: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-UNSAFE-LABEL: name: test_half_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -206,10 +248,12 @@ body:             |
     ; GFX9-UNSAFE: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-LABEL: name: test_half_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -217,11 +261,13 @@ body:             |
     ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[FMUL]], [[TRUNC2]]
     ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-CONTRACT-LABEL: name: test_half_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -229,10 +275,12 @@ body:             |
     ; GFX10-CONTRACT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX10-CONTRACT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-DENORM-LABEL: name: test_half_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -240,11 +288,13 @@ body:             |
     ; GFX10-DENORM: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-DENORM: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[FMUL]], [[TRUNC2]]
     ; GFX10-DENORM: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX10-DENORM: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-UNSAFE-LABEL: name: test_half_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -252,28 +302,32 @@ body:             |
     ; GFX10-UNSAFE: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX10-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %4:_(s32) = COPY $vgpr0
     %0:_(s16) = G_TRUNC %4(s32)
     %5:_(s32) = COPY $vgpr1
     %1:_(s16) = G_TRUNC %5(s32)
     %6:_(s32) = COPY $vgpr2
     %2:_(s16) = G_TRUNC %6(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %7:_(s16) = reassoc G_FMUL %0, %1
     %8:_(s16) = reassoc G_FADD %7, %2
     %10:_(s32) = G_ANYEXT %8(s16)
     $vgpr0 = COPY %10(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %9:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %9, implicit $vgpr0
 ...
 
 ---
 name:            test_half_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_half_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -282,11 +336,13 @@ body:             |
     ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-CONTRACT-LABEL: name: test_half_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -294,10 +350,12 @@ body:             |
     ; GFX9-CONTRACT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX9-CONTRACT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -305,11 +363,13 @@ body:             |
     ; GFX9-DENORM: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-DENORM: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
     ; GFX9-DENORM: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX9-DENORM: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX9-UNSAFE-LABEL: name: test_half_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -317,10 +377,12 @@ body:             |
     ; GFX9-UNSAFE: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX9-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX9-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-LABEL: name: test_half_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -328,11 +390,13 @@ body:             |
     ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
     ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-CONTRACT-LABEL: name: test_half_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -340,10 +404,12 @@ body:             |
     ; GFX10-CONTRACT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-CONTRACT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-CONTRACT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX10-CONTRACT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -351,11 +417,13 @@ body:             |
     ; GFX10-DENORM: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-DENORM: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-DENORM: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-DENORM: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
     ; GFX10-DENORM: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
     ; GFX10-DENORM: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-DENORM: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     ; GFX10-UNSAFE-LABEL: name: test_half_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -363,28 +431,32 @@ body:             |
     ; GFX10-UNSAFE: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX10-UNSAFE: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX10-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; GFX10-UNSAFE: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
     ; GFX10-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %4:_(s32) = COPY $vgpr0
     %0:_(s16) = G_TRUNC %4(s32)
     %5:_(s32) = COPY $vgpr1
     %1:_(s16) = G_TRUNC %5(s32)
     %6:_(s32) = COPY $vgpr2
     %2:_(s16) = G_TRUNC %6(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %7:_(s16) = reassoc G_FMUL %0, %1
     %8:_(s16) = reassoc G_FADD %2, %7
     %10:_(s32) = G_ANYEXT %8(s16)
     $vgpr0 = COPY %10(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %9:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %9, implicit $vgpr0
 ...
 
 ---
 name:            test_double_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_double_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -396,12 +468,14 @@ body:             |
     ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[MV]], [[MV1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s64) = reassoc G_FADD [[FMUL]], [[MV2]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX9: $vgpr0 = COPY [[UV]](s32)
     ; GFX9: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-CONTRACT-LABEL: name: test_double_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -412,11 +486,13 @@ body:             |
     ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-CONTRACT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-CONTRACT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-DENORM-LABEL: name: test_double_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -427,12 +503,14 @@ body:             |
     ; GFX9-DENORM: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-DENORM: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-DENORM: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[MV]], [[MV1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s64) = reassoc G_FADD [[FMUL]], [[MV2]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX9-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-UNSAFE-LABEL: name: test_double_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -443,11 +521,13 @@ body:             |
     ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-UNSAFE: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-UNSAFE: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-LABEL: name: test_double_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -458,12 +538,14 @@ body:             |
     ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[MV]], [[MV1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s64) = reassoc G_FADD [[FMUL]], [[MV2]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX10: $vgpr0 = COPY [[UV]](s32)
     ; GFX10: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-CONTRACT-LABEL: name: test_double_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -474,11 +556,13 @@ body:             |
     ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-CONTRACT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-CONTRACT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-DENORM-LABEL: name: test_double_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -489,12 +573,14 @@ body:             |
     ; GFX10-DENORM: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-DENORM: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-DENORM: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[MV]], [[MV1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s64) = reassoc G_FADD [[FMUL]], [[MV2]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX10-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-UNSAFE-LABEL: name: test_double_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -505,11 +591,13 @@ body:             |
     ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-UNSAFE: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-UNSAFE: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -519,19 +607,21 @@ body:             |
     %8:_(s32) = COPY $vgpr4
     %9:_(s32) = COPY $vgpr5
     %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %10:_(s64) = reassoc G_FMUL %0, %1
     %11:_(s64) = reassoc G_FADD %10, %2
     %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
     $vgpr0 = COPY %13(s32)
     $vgpr1 = COPY %14(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %12:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %12, implicit $vgpr0, implicit $vgpr1
 ...
 
 ---
 name:            test_double_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_double_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -543,12 +633,14 @@ body:             |
     ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[MV]], [[MV1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(s64) = reassoc G_FADD [[MV2]], [[FMUL]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX9: $vgpr0 = COPY [[UV]](s32)
     ; GFX9: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-CONTRACT-LABEL: name: test_double_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -559,11 +651,13 @@ body:             |
     ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-CONTRACT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-CONTRACT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -574,12 +668,14 @@ body:             |
     ; GFX9-DENORM: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-DENORM: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-DENORM: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[MV]], [[MV1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(s64) = reassoc G_FADD [[MV2]], [[FMUL]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX9-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-UNSAFE-LABEL: name: test_double_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -590,11 +686,13 @@ body:             |
     ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX9-UNSAFE: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX9-UNSAFE: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX9-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-LABEL: name: test_double_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -605,12 +703,14 @@ body:             |
     ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[MV]], [[MV1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(s64) = reassoc G_FADD [[MV2]], [[FMUL]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX10: $vgpr0 = COPY [[UV]](s32)
     ; GFX10: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-CONTRACT-LABEL: name: test_double_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -621,11 +721,13 @@ body:             |
     ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-CONTRACT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-CONTRACT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -636,12 +738,14 @@ body:             |
     ; GFX10-DENORM: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-DENORM: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-DENORM: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[MV]], [[MV1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(s64) = reassoc G_FADD [[MV2]], [[FMUL]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](s64)
     ; GFX10-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-UNSAFE-LABEL: name: test_double_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -652,11 +756,13 @@ body:             |
     ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
     ; GFX10-UNSAFE: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
     ; GFX10-UNSAFE: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -666,19 +772,21 @@ body:             |
     %8:_(s32) = COPY $vgpr4
     %9:_(s32) = COPY $vgpr5
     %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %10:_(s64) = reassoc G_FMUL %0, %1
     %11:_(s64) = reassoc G_FADD %2, %10
     %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
     $vgpr0 = COPY %13(s32)
     $vgpr1 = COPY %14(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %12:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %12, implicit $vgpr0, implicit $vgpr1
 ...
 
 ---
 name:            test_4xfloat_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_4xfloat_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -696,6 +804,7 @@ body:             |
     ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -703,7 +812,8 @@ body:             |
     ; GFX9: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9: $vgpr2 = COPY [[UV2]](s32)
     ; GFX9: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX9: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX9: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -720,13 +830,15 @@ body:             |
     ; GFX9-CONTRACT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX9-CONTRACT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX9-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9-CONTRACT: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-CONTRACT: $vgpr2 = COPY [[UV2]](s32)
     ; GFX9-CONTRACT: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX9-CONTRACT: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -743,6 +855,7 @@ body:             |
     ; GFX9-DENORM: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX9-DENORM: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX9-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9-DENORM: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -750,7 +863,8 @@ body:             |
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-DENORM: $vgpr2 = COPY [[UV2]](s32)
     ; GFX9-DENORM: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX9-DENORM: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -767,13 +881,15 @@ body:             |
     ; GFX9-UNSAFE: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX9-UNSAFE: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX9-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX9-UNSAFE: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-UNSAFE: $vgpr2 = COPY [[UV2]](s32)
     ; GFX9-UNSAFE: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX9-UNSAFE: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX10-LABEL: name: test_4xfloat_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -790,6 +906,7 @@ body:             |
     ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX10: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -797,7 +914,8 @@ body:             |
     ; GFX10: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10: $vgpr2 = COPY [[UV2]](s32)
     ; GFX10: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX10: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX10: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -814,13 +932,15 @@ body:             |
     ; GFX10-CONTRACT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX10-CONTRACT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX10-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX10-CONTRACT: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-CONTRACT: $vgpr2 = COPY [[UV2]](s32)
     ; GFX10-CONTRACT: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX10-CONTRACT: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -837,6 +957,7 @@ body:             |
     ; GFX10-DENORM: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX10-DENORM: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX10-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX10-DENORM: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
@@ -844,7 +965,8 @@ body:             |
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-DENORM: $vgpr2 = COPY [[UV2]](s32)
     ; GFX10-DENORM: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX10-DENORM: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -861,13 +983,15 @@ body:             |
     ; GFX10-UNSAFE: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
     ; GFX10-UNSAFE: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
     ; GFX10-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+    ; GFX10-UNSAFE: [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-UNSAFE: $vgpr2 = COPY [[UV2]](s32)
     ; GFX10-UNSAFE: $vgpr3 = COPY [[UV3]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    ; GFX10-UNSAFE: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY12]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY13]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %6:_(s32) = COPY $vgpr2
@@ -883,6 +1007,7 @@ body:             |
     %14:_(s32) = COPY $vgpr10
     %15:_(s32) = COPY $vgpr11
     %2:_(<4 x s32>) = G_BUILD_VECTOR %12(s32), %13(s32), %14(s32), %15(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %16:_(<4 x s32>) = reassoc G_FMUL %0, %1
     %17:_(<4 x s32>) = reassoc G_FADD %16, %2
     %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32) = G_UNMERGE_VALUES %17(<4 x s32>)
@@ -890,14 +1015,15 @@ body:             |
     $vgpr1 = COPY %20(s32)
     $vgpr2 = COPY %21(s32)
     $vgpr3 = COPY %22(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+    %18:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %18, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 ...
 
 ---
 name:            test_3xfloat_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -912,13 +1038,15 @@ body:             |
     ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX9: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX9: $vgpr0 = COPY [[UV]](s32)
     ; GFX9: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX9: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX9: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -932,12 +1060,14 @@ body:             |
     ; GFX9-CONTRACT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX9-CONTRACT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX9-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX9-CONTRACT: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-CONTRACT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX9-CONTRACT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -951,13 +1081,15 @@ body:             |
     ; GFX9-DENORM: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX9-DENORM: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX9-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX9-DENORM: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX9-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-DENORM: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX9-DENORM: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -971,12 +1103,14 @@ body:             |
     ; GFX9-UNSAFE: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX9-UNSAFE: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX9-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX9-UNSAFE: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
     ; GFX9-UNSAFE: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX9-UNSAFE: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -990,13 +1124,15 @@ body:             |
     ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX10: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX10: $vgpr0 = COPY [[UV]](s32)
     ; GFX10: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX10: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX10: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1010,12 +1146,14 @@ body:             |
     ; GFX10-CONTRACT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX10-CONTRACT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX10-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX10-CONTRACT: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-CONTRACT: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX10-CONTRACT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1029,13 +1167,15 @@ body:             |
     ; GFX10-DENORM: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX10-DENORM: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX10-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX10-DENORM: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
     ; GFX10-DENORM: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-DENORM: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX10-DENORM: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1049,12 +1189,14 @@ body:             |
     ; GFX10-UNSAFE: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
     ; GFX10-UNSAFE: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
     ; GFX10-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+    ; GFX10-UNSAFE: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](s32)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](s32)
     ; GFX10-UNSAFE: $vgpr2 = COPY [[UV2]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    ; GFX10-UNSAFE: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %6:_(s32) = COPY $vgpr2
@@ -1067,20 +1209,22 @@ body:             |
     %11:_(s32) = COPY $vgpr7
     %12:_(s32) = COPY $vgpr8
     %2:_(<3 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32), %12(s32)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %13:_(<3 x s32>) = reassoc G_FMUL %0, %1
     %14:_(<3 x s32>) = reassoc G_FADD %2, %13
     %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(<3 x s32>)
     $vgpr0 = COPY %16(s32)
     $vgpr1 = COPY %17(s32)
     $vgpr2 = COPY %18(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+    %15:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %15, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
 ...
 
 ---
 name:            test_4xhalf_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_4xhalf_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
@@ -1092,12 +1236,14 @@ body:             |
     ; GFX9: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX9: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX9: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<4 x s16>) = reassoc G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<4 x s16>) = reassoc G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX9: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-CONTRACT-LABEL: name: test_4xhalf_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1108,11 +1254,13 @@ body:             |
     ; GFX9-CONTRACT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX9-CONTRACT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX9-CONTRACT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX9-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1123,12 +1271,14 @@ body:             |
     ; GFX9-DENORM: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX9-DENORM: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX9-DENORM: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX9-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s16>) = reassoc G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<4 x s16>) = reassoc G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX9-DENORM: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-UNSAFE-LABEL: name: test_4xhalf_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1139,11 +1289,13 @@ body:             |
     ; GFX9-UNSAFE: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX9-UNSAFE: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX9-UNSAFE: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX9-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-LABEL: name: test_4xhalf_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1154,12 +1306,14 @@ body:             |
     ; GFX10: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX10: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX10: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX10: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<4 x s16>) = reassoc G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<4 x s16>) = reassoc G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX10: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX10: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-CONTRACT-LABEL: name: test_4xhalf_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1170,11 +1324,13 @@ body:             |
     ; GFX10-CONTRACT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX10-CONTRACT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX10-CONTRACT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX10-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1185,12 +1341,14 @@ body:             |
     ; GFX10-DENORM: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX10-DENORM: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX10-DENORM: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX10-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s16>) = reassoc G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<4 x s16>) = reassoc G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
     ; GFX10-DENORM: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-UNSAFE-LABEL: name: test_4xhalf_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1201,11 +1359,13 @@ body:             |
     ; GFX10-UNSAFE: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
     ; GFX10-UNSAFE: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
     ; GFX10-UNSAFE: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+    ; GFX10-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     %4:_(<2 x s16>) = COPY $vgpr0
     %5:_(<2 x s16>) = COPY $vgpr1
     %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
@@ -1215,19 +1375,21 @@ body:             |
     %8:_(<2 x s16>) = COPY $vgpr4
     %9:_(<2 x s16>) = COPY $vgpr5
     %2:_(<4 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %10:_(<4 x s16>) = reassoc G_FMUL %0, %1
     %11:_(<4 x s16>) = reassoc G_FADD %10, %2
     %13:_(<2 x s16>), %14:_(<2 x s16>) = G_UNMERGE_VALUES %11(<4 x s16>)
     $vgpr0 = COPY %13(<2 x s16>)
     $vgpr1 = COPY %14(<2 x s16>)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %12:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %12, implicit $vgpr0, implicit $vgpr1
 ...
 
 ---
 name:            test_3xhalf_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
@@ -1249,6 +1411,7 @@ body:             |
     ; GFX9: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX9: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX9: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<3 x s16>) = reassoc G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<3 x s16>) = reassoc G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX9: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1256,7 +1419,8 @@ body:             |
     ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX9: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1277,13 +1441,15 @@ body:             |
     ; GFX9-CONTRACT: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX9-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX9-CONTRACT: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX9-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[BITCAST1]], [[BITCAST3]], [[BITCAST5]]
     ; GFX9-CONTRACT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
     ; GFX9-CONTRACT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-CONTRACT: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1304,6 +1470,7 @@ body:             |
     ; GFX9-DENORM: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX9-DENORM: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX9-DENORM: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX9-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s16>) = reassoc G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<3 x s16>) = reassoc G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX9-DENORM: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1311,7 +1478,8 @@ body:             |
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX9-DENORM: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-DENORM: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX9-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1332,13 +1500,15 @@ body:             |
     ; GFX9-UNSAFE: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX9-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX9-UNSAFE: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX9-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[BITCAST1]], [[BITCAST3]], [[BITCAST5]]
     ; GFX9-UNSAFE: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
     ; GFX9-UNSAFE: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX9-UNSAFE: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX9-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1359,6 +1529,7 @@ body:             |
     ; GFX10: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX10: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX10: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX10: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<3 x s16>) = reassoc G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<3 x s16>) = reassoc G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX10: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1366,7 +1537,8 @@ body:             |
     ; GFX10: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX10: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1387,13 +1559,15 @@ body:             |
     ; GFX10-CONTRACT: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX10-CONTRACT: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX10-CONTRACT: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX10-CONTRACT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[BITCAST1]], [[BITCAST3]], [[BITCAST5]]
     ; GFX10-CONTRACT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
     ; GFX10-CONTRACT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-CONTRACT: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-CONTRACT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1414,6 +1588,7 @@ body:             |
     ; GFX10-DENORM: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX10-DENORM: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX10-DENORM: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX10-DENORM: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s16>) = reassoc G_FMUL [[BITCAST1]], [[BITCAST3]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<3 x s16>) = reassoc G_FADD [[BITCAST5]], [[FMUL]]
     ; GFX10-DENORM: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1421,7 +1596,8 @@ body:             |
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX10-DENORM: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-DENORM: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-DENORM: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     ; GFX10-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1442,13 +1618,15 @@ body:             |
     ; GFX10-UNSAFE: [[BITCAST4:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX10-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s48) = G_TRUNC [[BITCAST4]](s96)
     ; GFX10-UNSAFE: [[BITCAST5:%[0-9]+]]:_(<3 x s16>) = G_BITCAST [[TRUNC2]](s48)
+    ; GFX10-UNSAFE: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[BITCAST1]], [[BITCAST3]], [[BITCAST5]]
     ; GFX10-UNSAFE: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
     ; GFX10-UNSAFE: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](<2 x s16>)
     ; GFX10-UNSAFE: $vgpr1 = COPY [[UV1]](<2 x s16>)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; GFX10-UNSAFE: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1
     %4:_(<2 x s16>) = COPY $vgpr0
     %5:_(<2 x s16>) = COPY $vgpr1
     %10:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -1462,6 +1640,7 @@ body:             |
     %9:_(<2 x s16>) = COPY $vgpr5
     %15:_(<6 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>), %10(<2 x s16>)
     %2:_(<3 x s16>), %16:_(<3 x s16>) = G_UNMERGE_VALUES %15(<6 x s16>)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %17:_(<3 x s16>) = reassoc G_FMUL %0, %1
     %18:_(<3 x s16>) = reassoc G_FADD %2, %17
     %22:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1469,14 +1648,15 @@ body:             |
     %20:_(<2 x s16>), %21:_(<2 x s16>), %24:_(<2 x s16>) = G_UNMERGE_VALUES %23(<6 x s16>)
     $vgpr0 = COPY %20(<2 x s16>)
     $vgpr1 = COPY %21(<2 x s16>)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %19:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %19, implicit $vgpr0, implicit $vgpr1
 ...
 
 ---
 name:            test_4xdouble_add_mul
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_4xdouble_add_mul
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -1518,6 +1698,7 @@ body:             |
     ; GFX9: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX9: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX9: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<4 x s64>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<4 x s64>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1529,7 +1710,8 @@ body:             |
     ; GFX9: $vgpr5 = COPY [[UV5]](s32)
     ; GFX9: $vgpr6 = COPY [[UV6]](s32)
     ; GFX9: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX9: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX9: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX9-CONTRACT-LABEL: name: test_4xdouble_add_mul
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1570,6 +1752,7 @@ body:             |
     ; GFX9-CONTRACT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX9-CONTRACT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX9-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX9-CONTRACT: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](s32)
@@ -1580,7 +1763,8 @@ body:             |
     ; GFX9-CONTRACT: $vgpr5 = COPY [[UV5]](s32)
     ; GFX9-CONTRACT: $vgpr6 = COPY [[UV6]](s32)
     ; GFX9-CONTRACT: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX9-CONTRACT: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1621,6 +1805,7 @@ body:             |
     ; GFX9-DENORM: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX9-DENORM: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX9-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX9-DENORM: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s64>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<4 x s64>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1632,7 +1817,8 @@ body:             |
     ; GFX9-DENORM: $vgpr5 = COPY [[UV5]](s32)
     ; GFX9-DENORM: $vgpr6 = COPY [[UV6]](s32)
     ; GFX9-DENORM: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX9-DENORM: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX9-UNSAFE-LABEL: name: test_4xdouble_add_mul
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1673,6 +1859,7 @@ body:             |
     ; GFX9-UNSAFE: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX9-UNSAFE: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX9-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX9-UNSAFE: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](s32)
@@ -1683,7 +1870,8 @@ body:             |
     ; GFX9-UNSAFE: $vgpr5 = COPY [[UV5]](s32)
     ; GFX9-UNSAFE: $vgpr6 = COPY [[UV6]](s32)
     ; GFX9-UNSAFE: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX9-UNSAFE: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX10-LABEL: name: test_4xdouble_add_mul
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1724,6 +1912,7 @@ body:             |
     ; GFX10: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX10: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX10: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<4 x s64>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<4 x s64>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1735,7 +1924,8 @@ body:             |
     ; GFX10: $vgpr5 = COPY [[UV5]](s32)
     ; GFX10: $vgpr6 = COPY [[UV6]](s32)
     ; GFX10: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX10: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX10: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX10-CONTRACT-LABEL: name: test_4xdouble_add_mul
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1776,6 +1966,7 @@ body:             |
     ; GFX10-CONTRACT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX10-CONTRACT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX10-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX10-CONTRACT: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](s32)
@@ -1786,7 +1977,8 @@ body:             |
     ; GFX10-CONTRACT: $vgpr5 = COPY [[UV5]](s32)
     ; GFX10-CONTRACT: $vgpr6 = COPY [[UV6]](s32)
     ; GFX10-CONTRACT: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX10-CONTRACT: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1827,6 +2019,7 @@ body:             |
     ; GFX10-DENORM: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX10-DENORM: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX10-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX10-DENORM: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<4 x s64>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<4 x s64>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
@@ -1838,7 +2031,8 @@ body:             |
     ; GFX10-DENORM: $vgpr5 = COPY [[UV5]](s32)
     ; GFX10-DENORM: $vgpr6 = COPY [[UV6]](s32)
     ; GFX10-DENORM: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX10-DENORM: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     ; GFX10-UNSAFE-LABEL: name: test_4xdouble_add_mul
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1879,6 +2073,7 @@ body:             |
     ; GFX10-UNSAFE: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
     ; GFX10-UNSAFE: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
     ; GFX10-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+    ; GFX10-UNSAFE: [[COPY24:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](s32)
@@ -1889,7 +2084,8 @@ body:             |
     ; GFX10-UNSAFE: $vgpr5 = COPY [[UV5]](s32)
     ; GFX10-UNSAFE: $vgpr6 = COPY [[UV6]](s32)
     ; GFX10-UNSAFE: $vgpr7 = COPY [[UV7]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    ; GFX10-UNSAFE: [[COPY25:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY24]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY25]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %6:_(s32) = COPY $vgpr2
@@ -1929,6 +2125,7 @@ body:             |
     %38:_(s64) = G_MERGE_VALUES %24(s32), %25(s32)
     %39:_(s64) = G_MERGE_VALUES %26(s32), %27(s32)
     %2:_(<4 x s64>) = G_BUILD_VECTOR %36(s64), %37(s64), %38(s64), %39(s64)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %40:_(<4 x s64>) = reassoc G_FMUL %0, %1
     %41:_(<4 x s64>) = reassoc G_FADD %40, %2
     %43:_(s32), %44:_(s32), %45:_(s32), %46:_(s32), %47:_(s32), %48:_(s32), %49:_(s32), %50:_(s32) = G_UNMERGE_VALUES %41(<4 x s64>)
@@ -1940,14 +2137,15 @@ body:             |
     $vgpr5 = COPY %48(s32)
     $vgpr6 = COPY %49(s32)
     $vgpr7 = COPY %50(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+    %42:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %42, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
 ...
 
 ---
 name:            test_3xdouble_add_mul_rhs
 body:             |
   bb.1.entry:
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $sgpr30_sgpr31
 
     ; GFX9-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -1980,6 +2178,7 @@ body:             |
     ; GFX9: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX9: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX9: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX9: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9: [[FMUL:%[0-9]+]]:_(<3 x s64>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9: [[FADD:%[0-9]+]]:_(<3 x s64>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -1989,7 +2188,8 @@ body:             |
     ; GFX9: $vgpr3 = COPY [[UV3]](s32)
     ; GFX9: $vgpr4 = COPY [[UV4]](s32)
     ; GFX9: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX9: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX9: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX9: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX9-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX9-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2021,6 +2221,7 @@ body:             |
     ; GFX9-CONTRACT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX9-CONTRACT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX9-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX9-CONTRACT: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-CONTRACT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX9-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
     ; GFX9-CONTRACT: $vgpr0 = COPY [[UV]](s32)
@@ -2029,7 +2230,8 @@ body:             |
     ; GFX9-CONTRACT: $vgpr3 = COPY [[UV3]](s32)
     ; GFX9-CONTRACT: $vgpr4 = COPY [[UV4]](s32)
     ; GFX9-CONTRACT: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX9-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX9-CONTRACT: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX9-CONTRACT: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2061,6 +2263,7 @@ body:             |
     ; GFX9-DENORM: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX9-DENORM: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX9-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX9-DENORM: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s64>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX9-DENORM: [[FADD:%[0-9]+]]:_(<3 x s64>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX9-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2070,7 +2273,8 @@ body:             |
     ; GFX9-DENORM: $vgpr3 = COPY [[UV3]](s32)
     ; GFX9-DENORM: $vgpr4 = COPY [[UV4]](s32)
     ; GFX9-DENORM: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX9-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX9-DENORM: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX9-DENORM: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX9-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2102,6 +2306,7 @@ body:             |
     ; GFX9-UNSAFE: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX9-UNSAFE: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX9-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX9-UNSAFE: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[UV]](s32)
@@ -2110,7 +2315,8 @@ body:             |
     ; GFX9-UNSAFE: $vgpr3 = COPY [[UV3]](s32)
     ; GFX9-UNSAFE: $vgpr4 = COPY [[UV4]](s32)
     ; GFX9-UNSAFE: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX9-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX9-UNSAFE: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX9-UNSAFE: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2142,6 +2348,7 @@ body:             |
     ; GFX10: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX10: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX10: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10: [[FMUL:%[0-9]+]]:_(<3 x s64>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10: [[FADD:%[0-9]+]]:_(<3 x s64>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2151,7 +2358,8 @@ body:             |
     ; GFX10: $vgpr3 = COPY [[UV3]](s32)
     ; GFX10: $vgpr4 = COPY [[UV4]](s32)
     ; GFX10: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX10: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX10: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX10: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX10-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX10-CONTRACT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-CONTRACT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2183,6 +2391,7 @@ body:             |
     ; GFX10-CONTRACT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX10-CONTRACT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX10-CONTRACT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX10-CONTRACT: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-CONTRACT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX10-CONTRACT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
     ; GFX10-CONTRACT: $vgpr0 = COPY [[UV]](s32)
@@ -2191,7 +2400,8 @@ body:             |
     ; GFX10-CONTRACT: $vgpr3 = COPY [[UV3]](s32)
     ; GFX10-CONTRACT: $vgpr4 = COPY [[UV4]](s32)
     ; GFX10-CONTRACT: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX10-CONTRACT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX10-CONTRACT: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX10-CONTRACT: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-DENORM: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2223,6 +2433,7 @@ body:             |
     ; GFX10-DENORM: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX10-DENORM: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX10-DENORM: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX10-DENORM: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-DENORM: [[FMUL:%[0-9]+]]:_(<3 x s64>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
     ; GFX10-DENORM: [[FADD:%[0-9]+]]:_(<3 x s64>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
     ; GFX10-DENORM: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
@@ -2232,7 +2443,8 @@ body:             |
     ; GFX10-DENORM: $vgpr3 = COPY [[UV3]](s32)
     ; GFX10-DENORM: $vgpr4 = COPY [[UV4]](s32)
     ; GFX10-DENORM: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX10-DENORM: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX10-DENORM: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX10-DENORM: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     ; GFX10-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
     ; GFX10-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2264,6 +2476,7 @@ body:             |
     ; GFX10-UNSAFE: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
     ; GFX10-UNSAFE: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
     ; GFX10-UNSAFE: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+    ; GFX10-UNSAFE: [[COPY18:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; GFX10-UNSAFE: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
     ; GFX10-UNSAFE: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
     ; GFX10-UNSAFE: $vgpr0 = COPY [[UV]](s32)
@@ -2272,7 +2485,8 @@ body:             |
     ; GFX10-UNSAFE: $vgpr3 = COPY [[UV3]](s32)
     ; GFX10-UNSAFE: $vgpr4 = COPY [[UV4]](s32)
     ; GFX10-UNSAFE: $vgpr5 = COPY [[UV5]](s32)
-    ; GFX10-UNSAFE: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    ; GFX10-UNSAFE: [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY18]]
+    ; GFX10-UNSAFE: S_SETPC_B64_return [[COPY19]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
     %4:_(s32) = COPY $vgpr0
     %5:_(s32) = COPY $vgpr1
     %6:_(s32) = COPY $vgpr2
@@ -2303,6 +2517,7 @@ body:             |
     %29:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
     %30:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
     %2:_(<3 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64)
+    %3:sgpr_64 = COPY $sgpr30_sgpr31
     %31:_(<3 x s64>) = reassoc G_FMUL %0, %1
     %32:_(<3 x s64>) = reassoc G_FADD %2, %31
     %34:_(s32), %35:_(s32), %36:_(s32), %37:_(s32), %38:_(s32), %39:_(s32) = G_UNMERGE_VALUES %32(<3 x s64>)
@@ -2312,5 +2527,6 @@ body:             |
     $vgpr3 = COPY %37(s32)
     $vgpr4 = COPY %38(s32)
     $vgpr5 = COPY %39(s32)
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+    %33:ccr_sgpr_64 = COPY %3
+    S_SETPC_B64_return %33, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
 ...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-illegal-types.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-illegal-types.mir
index dcec23c030c46..e271229aa1fd4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-illegal-types.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-illegal-types.mir
@@ -283,9 +283,10 @@ body:             |
     liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: ushlsat_i44
-    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
     ; CHECK: [[TRUNC:%[0-9]+]]:_(s44) = G_TRUNC [[MV]](s64)
     ; CHECK: [[C:%[0-9]+]]:_(s44) = G_CONSTANT i44 22
@@ -295,9 +296,11 @@ body:             |
     ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s64)
     ; CHECK: $vgpr0 = COPY [[UV]](s32)
     ; CHECK: $vgpr1 = COPY [[UV1]](s32)
-    ; CHECK: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+    ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0, implicit $vgpr1
     %2:_(s32) = COPY $vgpr0
     %3:_(s32) = COPY $vgpr1
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %4:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
     %0:_(s44) = G_TRUNC %4(s64)
     %5:_(s44) = G_CONSTANT i44 22
@@ -307,7 +310,8 @@ body:             |
     %10:_(s32), %11:_(s32) = G_UNMERGE_VALUES %9(s64)
     $vgpr0 = COPY %10(s32)
     $vgpr1 = COPY %11(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %8:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %8, implicit $vgpr0, implicit $vgpr1
 
 ...
 ---
@@ -315,12 +319,13 @@ name:            ushlsat_i55
 tracksRegLiveness: true
 body:             |
   bb.0:
-    liveins: $vgpr0, $vgpr1
+    liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: ushlsat_i55
-    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
     ; CHECK: [[TRUNC:%[0-9]+]]:_(s55) = G_TRUNC [[MV]](s64)
     ; CHECK: [[C:%[0-9]+]]:_(s55) = G_CONSTANT i55 53
@@ -329,9 +334,11 @@ body:             |
     ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s64)
     ; CHECK: $vgpr0 = COPY [[UV]](s32)
     ; CHECK: $vgpr1 = COPY [[UV1]](s32)
-    ; CHECK: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+    ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0, implicit $vgpr1
     %2:_(s32) = COPY $vgpr0
     %3:_(s32) = COPY $vgpr1
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %4:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
     %0:_(s55) = G_TRUNC %4(s64)
     %5:_(s55) = G_CONSTANT i55 50
@@ -342,6 +349,7 @@ body:             |
     %11:_(s32), %12:_(s32) = G_UNMERGE_VALUES %10(s64)
     $vgpr0 = COPY %11(s32)
     $vgpr1 = COPY %12(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %9:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %9, implicit $vgpr0, implicit $vgpr1
 
 ...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
index 5dfde116785db..8a957d1019125 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
@@ -6,14 +6,15 @@
 define i16 @vop3p_add_i16(i16 %arg0) #0 {
   ; CHECK-LABEL: name: vop3p_add_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC]]
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s16)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC]]
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s16)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %add = add i16 %arg0, %arg0
   ret i16 %add
 }
@@ -21,27 +22,28 @@ define i16 @vop3p_add_i16(i16 %arg0) #0 {
 define <2 x i16> @vop3p_add_v2i16(<2 x i16> %arg0) #0 {
   ; CHECK-LABEL: name: vop3p_add_v2i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
-  ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; CHECK-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-  ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-  ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-  ; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-  ; CHECK-NEXT:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-  ; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC2]]
-  ; CHECK-NEXT:   [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[TRUNC3]]
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16)
-  ; CHECK-NEXT:   [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16)
-  ; CHECK-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
-  ; CHECK-NEXT:   [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
-  ; CHECK-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; CHECK:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; CHECK:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+  ; CHECK:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+  ; CHECK:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+  ; CHECK:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; CHECK:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+  ; CHECK:   [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC2]]
+  ; CHECK:   [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[TRUNC3]]
+  ; CHECK:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16)
+  ; CHECK:   [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16)
+  ; CHECK:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+  ; CHECK:   [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
+  ; CHECK:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+  ; CHECK:   $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %add = add <2 x i16> %arg0, %arg0
   ret <2 x i16> %add
 }
@@ -49,12 +51,13 @@ define <2 x i16> @vop3p_add_v2i16(<2 x i16> %arg0) #0 {
 define i16 @halfinsts_add_i16(i16 %arg0) #1 {
   ; CHECK-LABEL: name: halfinsts_add_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY]]
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ADD]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY]]
+  ; CHECK:   $vgpr0 = COPY [[ADD]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %add = add i16 %arg0, %arg0
   ret i16 %add
 }
@@ -62,15 +65,16 @@ define i16 @halfinsts_add_i16(i16 %arg0) #1 {
 define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 {
   ; CHECK-LABEL: name: halfinsts_add_v2i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY]]
-  ; CHECK-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY1]]
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ADD]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[ADD1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY]]
+  ; CHECK:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY1]]
+  ; CHECK:   $vgpr0 = COPY [[ADD]](s32)
+  ; CHECK:   $vgpr1 = COPY [[ADD1]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0, implicit $vgpr1
   %add = add <2 x i16> %arg0, %arg0
   ret <2 x i16> %add
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 140c3ff694079..7cd475bf00987 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -667,8 +667,8 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX89-IEEE-NEXT:    v_fma_f32 v10, v12, v8, v10
 ; GFX89-IEEE-NEXT:    v_fma_f32 v11, v13, v9, v11
 ; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v4, v10, v6
-; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v8, v10
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v5, v5, v9, v11
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
@@ -860,8 +860,8 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX89-IEEE-NEXT:    v_fma_f32 v10, v12, v8, v10
 ; GFX89-IEEE-NEXT:    v_fma_f32 v11, v13, v9, v11
 ; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v4, v10, v6
-; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v8, v10
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v5, v5, v9, v11
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
@@ -1451,8 +1451,8 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX89-IEEE-NEXT:    v_fma_f32 v10, v12, v8, v10
 ; GFX89-IEEE-NEXT:    v_fma_f32 v11, v13, v9, v11
 ; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v4, v10, v6
-; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v8, v10
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v5, v5, v9, v11
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
index 39153cb8a744e..5431744849247 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -837,34 +837,33 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
-; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
+; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], 1.0, v[0:1], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
-; GFX6-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v11
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; GFX6-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
+; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0
-; GFX6-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
-; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v9
-; GFX6-NEXT:    v_mul_f64 v[12:13], v[8:9], v[6:7]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9]
-; GFX6-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0
-; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
-; GFX6-NEXT:    v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; GFX6-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
+; GFX6-NEXT:    v_mul_f64 v[14:15], v[10:11], v[6:7]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_mul_f64 v[14:15], v[16:17], v[4:5]
-; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13]
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17]
+; GFX6-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[14:15], v[10:11]
+; GFX6-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[10:11], v[6:7], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
+; GFX6-NEXT:    v_fma_f64 v[4:5], -v[8:9], v[12:13], 1.0
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
+; GFX6-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[12:13]
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v17
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v11
+; GFX6-NEXT:    v_mul_f64 v[12:13], v[16:17], v[4:5]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[12:13], v[16:17]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; GFX6-NEXT:    s_nop 0
-; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15]
+; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[10:11], v[4:5], v[12:13]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -961,34 +960,33 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
-; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
+; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], 1.0, v[0:1], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
-; GFX6-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v11
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; GFX6-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
+; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0
-; GFX6-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
-; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v9
-; GFX6-NEXT:    v_mul_f64 v[12:13], v[8:9], v[6:7]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9]
-; GFX6-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0
-; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
-; GFX6-NEXT:    v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; GFX6-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
+; GFX6-NEXT:    v_mul_f64 v[14:15], v[10:11], v[6:7]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_mul_f64 v[14:15], v[16:17], v[4:5]
-; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13]
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17]
+; GFX6-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[14:15], v[10:11]
+; GFX6-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[10:11], v[6:7], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
+; GFX6-NEXT:    v_fma_f64 v[4:5], -v[8:9], v[12:13], 1.0
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
+; GFX6-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[12:13]
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v17
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v11
+; GFX6-NEXT:    v_mul_f64 v[12:13], v[16:17], v[4:5]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[12:13], v[16:17]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; GFX6-NEXT:    s_nop 0
-; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15]
+; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[10:11], v[4:5], v[12:13]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1132,34 +1130,33 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
-; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
+; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], 1.0, v[0:1], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
-; GFX6-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v11
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; GFX6-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
+; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0
-; GFX6-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
-; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v9
-; GFX6-NEXT:    v_mul_f64 v[12:13], v[8:9], v[6:7]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9]
-; GFX6-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0
-; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
-; GFX6-NEXT:    v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; GFX6-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
+; GFX6-NEXT:    v_mul_f64 v[14:15], v[10:11], v[6:7]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_mul_f64 v[14:15], v[16:17], v[4:5]
-; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13]
-; GFX6-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17]
+; GFX6-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[14:15], v[10:11]
+; GFX6-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX6-NEXT:    v_div_fmas_f64 v[6:7], v[10:11], v[6:7], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0
+; GFX6-NEXT:    v_fma_f64 v[4:5], -v[8:9], v[12:13], 1.0
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
+; GFX6-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[12:13]
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v17
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v11
+; GFX6-NEXT:    v_mul_f64 v[12:13], v[16:17], v[4:5]
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
+; GFX6-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[12:13], v[16:17]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; GFX6-NEXT:    s_nop 0
-; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15]
+; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[10:11], v[4:5], v[12:13]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 4667975ca0d01..bc2c5c7371046 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -188,11 +188,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT:    v_exp_f16_e32 v1, v1
+; GFX9-NEXT:    v_exp_f16_e32 v1, v2
 ; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -274,11 +274,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT:    v_exp_f16_e32 v1, v1
+; GFX9-NEXT:    v_exp_f16_e32 v1, v2
 ; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -358,8 +358,8 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index f0f61e1803bd9..09e49cedca486 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -2094,18 +2094,18 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v6
 ; GFX9-NEXT:    v_bfe_u32 v2, v2, 1, 23
+; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v6
 ; GFX9-NEXT:    v_bfe_u32 v3, v3, 1, 23
-; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v9
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GFX9-NEXT:    v_mul_hi_u32 v7, v9, v7
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v9
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v6
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffffff
-; GFX9-NEXT:    v_and_b32_e32 v5, v5, v8
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffffff
+; GFX9-NEXT:    v_and_b32_e32 v5, v5, v9
+; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT:    v_mul_hi_u32 v7, v8, v7
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
@@ -2113,11 +2113,11 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, v9, v7
+; GFX9-NEXT:    v_add_u32_e32 v6, v8, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX9-NEXT:    v_sub_u32_e32 v7, 23, v4
-; GFX9-NEXT:    v_and_b32_e32 v7, v7, v8
-; GFX9-NEXT:    v_and_b32_e32 v4, v4, v8
+; GFX9-NEXT:    v_and_b32_e32 v7, v7, v9
+; GFX9-NEXT:    v_and_b32_e32 v4, v4, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v7, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
@@ -2129,8 +2129,8 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v4, 23, v2
-; GFX9-NEXT:    v_and_b32_e32 v4, v4, v8
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX9-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v1, v2, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index ff6618f8534b2..b570be394ca4e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -2100,40 +2100,40 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v9, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v9
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
 ; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX9-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v9
+; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GFX9-NEXT:    v_mul_hi_u32 v7, v9, v7
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v9
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v6
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffffff
-; GFX9-NEXT:    v_and_b32_e32 v5, v5, v8
-; GFX9-NEXT:    v_add_u32_e32 v7, v9, v7
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffffff
+; GFX9-NEXT:    v_and_b32_e32 v5, v5, v9
+; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v8
-; GFX9-NEXT:    v_and_b32_e32 v3, v3, v8
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX9-NEXT:    v_and_b32_e32 v3, v3, v9
+; GFX9-NEXT:    v_mul_hi_u32 v7, v8, v7
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
+; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v6, 23, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, v4, v8
-; GFX9-NEXT:    v_and_b32_e32 v6, v6, v8
+; GFX9-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX9-NEXT:    v_and_b32_e32 v6, v6, v9
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v5, v7
@@ -2144,8 +2144,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v4, 23, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v8
-; GFX9-NEXT:    v_and_b32_e32 v4, v4, v8
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX9-NEXT:    v_and_b32_e32 v4, v4, v9
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4095,8 +4095,8 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX10-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX10-NEXT:    v_and_b32_e32 v6, s4, v6
 ; GFX10-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v6, v0
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v3, v5, v3

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
index 53c3fa1064a6b..3dfd3ab5a15ad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -6,11 +6,14 @@
 define i1 @i1_func_void() #0 {
   ; CHECK-LABEL: name: i1_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `i1 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `i1 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s1)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i1, i1 addrspace(1)* undef
   ret i1 %val
 }
@@ -18,11 +21,14 @@ define i1 @i1_func_void() #0 {
 define zeroext i1 @i1_zeroext_func_void() #0 {
   ; CHECK-LABEL: name: i1_zeroext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `i1 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `i1 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
+  ; CHECK:   $vgpr0 = COPY [[ZEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i1, i1 addrspace(1)* undef
   ret i1 %val
 }
@@ -30,11 +36,14 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
 define signext i1 @i1_signext_func_void() #0 {
   ; CHECK-LABEL: name: i1_signext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `i1 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load (s1) from `i1 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
+  ; CHECK:   $vgpr0 = COPY [[SEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i1, i1 addrspace(1)* undef
   ret i1 %val
 }
@@ -42,11 +51,14 @@ define signext i1 @i1_signext_func_void() #0 {
 define i7 @i7_func_void() #0 {
   ; CHECK-LABEL: name: i7_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s7) = G_LOAD [[DEF]](p1) :: (load (s7) from `i7 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s7)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s7) = G_LOAD [[DEF]](p1) :: (load (s7) from `i7 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s7)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i7, i7 addrspace(1)* undef
   ret i7 %val
 }
@@ -54,11 +66,14 @@ define i7 @i7_func_void() #0 {
 define zeroext i7 @i7_zeroext_func_void() #0 {
   ; CHECK-LABEL: name: i7_zeroext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s7) = G_LOAD [[DEF]](p1) :: (load (s7) from `i7 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s7)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s7) = G_LOAD [[DEF]](p1) :: (load (s7) from `i7 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s7)
+  ; CHECK:   $vgpr0 = COPY [[ZEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i7, i7 addrspace(1)* undef
   ret i7 %val
 }
@@ -66,11 +81,14 @@ define zeroext i7 @i7_zeroext_func_void() #0 {
 define signext i7 @i7_signext_func_void() #0 {
   ; CHECK-LABEL: name: i7_signext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s7) = G_LOAD [[DEF]](p1) :: (load (s7) from `i7 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s7)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s7) = G_LOAD [[DEF]](p1) :: (load (s7) from `i7 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s7)
+  ; CHECK:   $vgpr0 = COPY [[SEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i7, i7 addrspace(1)* undef
   ret i7 %val
 }
@@ -78,11 +96,14 @@ define signext i7 @i7_signext_func_void() #0 {
 define i8 @i8_func_void() #0 {
   ; CHECK-LABEL: name: i8_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s8)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s8)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i8, i8 addrspace(1)* undef
   ret i8 %val
 }
@@ -90,11 +111,14 @@ define i8 @i8_func_void() #0 {
 define zeroext i8 @i8_zeroext_func_void() #0 {
   ; CHECK-LABEL: name: i8_zeroext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s8)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s8)
+  ; CHECK:   $vgpr0 = COPY [[ZEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i8, i8 addrspace(1)* undef
   ret i8 %val
 }
@@ -102,11 +126,14 @@ define zeroext i8 @i8_zeroext_func_void() #0 {
 define signext i8 @i8_signext_func_void() #0 {
   ; CHECK-LABEL: name: i8_signext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s8)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s8)
+  ; CHECK:   $vgpr0 = COPY [[SEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i8, i8 addrspace(1)* undef
   ret i8 %val
 }
@@ -114,11 +141,14 @@ define signext i8 @i8_signext_func_void() #0 {
 define i16 @i16_func_void() #0 {
   ; CHECK-LABEL: name: i16_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s16)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s16)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i16, i16 addrspace(1)* undef
   ret i16 %val
 }
@@ -126,11 +156,14 @@ define i16 @i16_func_void() #0 {
 define zeroext i16 @i16_zeroext_func_void() #0 {
   ; CHECK-LABEL: name: i16_zeroext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s16)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s16)
+  ; CHECK:   $vgpr0 = COPY [[ZEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i16, i16 addrspace(1)* undef
   ret i16 %val
 }
@@ -138,11 +171,14 @@ define zeroext i16 @i16_zeroext_func_void() #0 {
 define signext i16 @i16_signext_func_void() #0 {
   ; CHECK-LABEL: name: i16_signext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s16)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s16)
+  ; CHECK:   $vgpr0 = COPY [[SEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i16, i16 addrspace(1)* undef
   ret i16 %val
 }
@@ -150,11 +186,14 @@ define signext i16 @i16_signext_func_void() #0 {
 define half @f16_func_void() #0 {
   ; CHECK-LABEL: name: f16_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (load (s16) from `half addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s16)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (load (s16) from `half addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s16)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load half, half addrspace(1)* undef
   ret half %val
 }
@@ -162,11 +201,14 @@ define half @f16_func_void() #0 {
 define i24 @i24_func_void() #0 {
   ; CHECK-LABEL: name: i24_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load (s24) from `i24 addrspace(1)* undef`, align 4, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s24)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load (s24) from `i24 addrspace(1)* undef`, align 4, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s24)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i24, i24 addrspace(1)* undef
   ret i24 %val
 }
@@ -174,11 +216,14 @@ define i24 @i24_func_void() #0 {
 define zeroext i24 @i24_zeroext_func_void() #0 {
   ; CHECK-LABEL: name: i24_zeroext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load (s24) from `i24 addrspace(1)* undef`, align 4, addrspace 1)
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s24)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load (s24) from `i24 addrspace(1)* undef`, align 4, addrspace 1)
+  ; CHECK:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s24)
+  ; CHECK:   $vgpr0 = COPY [[ZEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i24, i24 addrspace(1)* undef
   ret i24 %val
 }
@@ -186,11 +231,14 @@ define zeroext i24 @i24_zeroext_func_void() #0 {
 define signext i24 @i24_signext_func_void() #0 {
   ; CHECK-LABEL: name: i24_signext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load (s24) from `i24 addrspace(1)* undef`, align 4, addrspace 1)
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s24)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load (s24) from `i24 addrspace(1)* undef`, align 4, addrspace 1)
+  ; CHECK:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s24)
+  ; CHECK:   $vgpr0 = COPY [[SEXT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i24, i24 addrspace(1)* undef
   ret i24 %val
 }
@@ -198,14 +246,17 @@ define signext i24 @i24_signext_func_void() #0 {
 define <2 x i24> @v2i24_func_void() #0 {
   ; CHECK-LABEL: name: v2i24_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s24>) = G_LOAD [[DEF]](p1) :: (load (<2 x s24>) from `<2 x i24> addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s24), [[UV1:%[0-9]+]]:_(s24) = G_UNMERGE_VALUES [[LOAD]](<2 x s24>)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s24)
-  ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s24)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[ANYEXT1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<2 x s24>) = G_LOAD [[DEF]](p1) :: (load (<2 x s24>) from `<2 x i24> addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s24), [[UV1:%[0-9]+]]:_(s24) = G_UNMERGE_VALUES [[LOAD]](<2 x s24>)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s24)
+  ; CHECK:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s24)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   $vgpr1 = COPY [[ANYEXT1]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load <2 x i24>, <2 x i24> addrspace(1)* undef
   ret <2 x i24> %val
 }
@@ -213,16 +264,19 @@ define <2 x i24> @v2i24_func_void() #0 {
 define <3 x i24> @v3i24_func_void() #0 {
   ; CHECK-LABEL: name: v3i24_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<3 x s24>) = G_LOAD [[DEF]](p1) :: (load (<3 x s24>) from `<3 x i24> addrspace(1)* undef`, align 16, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s24), [[UV1:%[0-9]+]]:_(s24), [[UV2:%[0-9]+]]:_(s24) = G_UNMERGE_VALUES [[LOAD]](<3 x s24>)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s24)
-  ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s24)
-  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s24)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[ANYEXT1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[ANYEXT2]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<3 x s24>) = G_LOAD [[DEF]](p1) :: (load (<3 x s24>) from `<3 x i24> addrspace(1)* undef`, align 16, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s24), [[UV1:%[0-9]+]]:_(s24), [[UV2:%[0-9]+]]:_(s24) = G_UNMERGE_VALUES [[LOAD]](<3 x s24>)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s24)
+  ; CHECK:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s24)
+  ; CHECK:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s24)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   $vgpr1 = COPY [[ANYEXT1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[ANYEXT2]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %val = load <3 x i24>, <3 x i24> addrspace(1)* undef
   ret <3 x i24> %val
 }
@@ -230,10 +284,13 @@ define <3 x i24> @v3i24_func_void() #0 {
 define i32 @i32_func_void() #0 {
   ; CHECK-LABEL: name: i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p1) :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[LOAD]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p1) :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   $vgpr0 = COPY [[LOAD]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i32, i32 addrspace(1)* undef
   ret i32 %val
 }
@@ -241,13 +298,16 @@ define i32 @i32_func_void() #0 {
 define i48 @i48_func_void() #0 {
   ; CHECK-LABEL: name: i48_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s48)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s64)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s48)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s64)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load i48, i48 addrspace(1)* undef, align 8
   ret i48 %val
 }
@@ -255,13 +315,16 @@ define i48 @i48_func_void() #0 {
 define signext i48 @i48_signext_func_void() #0 {
   ; CHECK-LABEL: name: i48_signext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD]](s48)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT]](s64)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD]](s48)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT]](s64)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load i48, i48 addrspace(1)* undef, align 8
   ret i48 %val
 }
@@ -269,13 +332,16 @@ define signext i48 @i48_signext_func_void() #0 {
 define zeroext i48 @i48_zeroext_func_void() #0 {
   ; CHECK-LABEL: name: i48_zeroext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s48)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s64)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s48)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s64)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load i48, i48 addrspace(1)* undef, align 8
   ret i48 %val
 }
@@ -283,12 +349,15 @@ define zeroext i48 @i48_zeroext_func_void() #0 {
 define i64 @i64_func_void() #0 {
   ; CHECK-LABEL: name: i64_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[DEF]](p1) :: (load (s64) from `i64 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[DEF]](p1) :: (load (s64) from `i64 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load i64, i64 addrspace(1)* undef
   ret i64 %val
 }
@@ -296,14 +365,17 @@ define i64 @i64_func_void() #0 {
 define i65 @i65_func_void() #0 {
   ; CHECK-LABEL: name: i65_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s65) = G_LOAD [[DEF]](p1) :: (load (s65) from `i65 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s96) = G_ANYEXT [[LOAD]](s65)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s96)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s65) = G_LOAD [[DEF]](p1) :: (load (s65) from `i65 addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s96) = G_ANYEXT [[LOAD]](s65)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s96)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %val = load i65, i65 addrspace(1)* undef
   ret i65 %val
 }
@@ -311,14 +383,17 @@ define i65 @i65_func_void() #0 {
 define signext i65 @i65_signext_func_void() #0 {
   ; CHECK-LABEL: name: i65_signext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s65) = G_LOAD [[DEF]](p1) :: (load (s65) from `i65 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s96) = G_SEXT [[LOAD]](s65)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT]](s96)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s65) = G_LOAD [[DEF]](p1) :: (load (s65) from `i65 addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[SEXT:%[0-9]+]]:_(s96) = G_SEXT [[LOAD]](s65)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT]](s96)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %val = load i65, i65 addrspace(1)* undef
   ret i65 %val
 }
@@ -326,14 +401,17 @@ define signext i65 @i65_signext_func_void() #0 {
 define zeroext i65 @i65_zeroext_func_void() #0 {
   ; CHECK-LABEL: name: i65_zeroext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s65) = G_LOAD [[DEF]](p1) :: (load (s65) from `i65 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s96) = G_ZEXT [[LOAD]](s65)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s96)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s65) = G_LOAD [[DEF]](p1) :: (load (s65) from `i65 addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[ZEXT:%[0-9]+]]:_(s96) = G_ZEXT [[LOAD]](s65)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s96)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %val = load i65, i65 addrspace(1)* undef
   ret i65 %val
 }
@@ -341,10 +419,13 @@ define zeroext i65 @i65_zeroext_func_void() #0 {
 define float @f32_func_void() #0 {
   ; CHECK-LABEL: name: f32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p1) :: (load (s32) from `float addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[LOAD]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p1) :: (load (s32) from `float addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   $vgpr0 = COPY [[LOAD]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load float, float addrspace(1)* undef
   ret float %val
 }
@@ -352,12 +433,15 @@ define float @f32_func_void() #0 {
 define double @f64_func_void() #0 {
   ; CHECK-LABEL: name: f64_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[DEF]](p1) :: (load (s64) from `double addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[DEF]](p1) :: (load (s64) from `double addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load double, double addrspace(1)* undef
   ret double %val
 }
@@ -365,14 +449,17 @@ define double @f64_func_void() #0 {
 define <2 x double> @v2f64_func_void() #0 {
   ; CHECK-LABEL: name: v2f64_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[DEF]](p1) :: (load (<2 x s64>) from `<2 x double> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[DEF]](p1) :: (load (<2 x s64>) from `<2 x double> addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %val = load <2 x double>, <2 x double> addrspace(1)* undef
   ret <2 x double> %val
 }
@@ -380,12 +467,15 @@ define <2 x double> @v2f64_func_void() #0 {
 define <2 x i32> @v2i32_func_void() #0 {
   ; CHECK-LABEL: name: v2i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[DEF]](p1) :: (load (<2 x s32>) from `<2 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[DEF]](p1) :: (load (<2 x s32>) from `<2 x i32> addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
   ret <2 x i32> %val
 }
@@ -393,13 +483,16 @@ define <2 x i32> @v2i32_func_void() #0 {
 define <3 x i32> @v3i32_func_void() #0 {
   ; CHECK-LABEL: name: v3i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[DEF]](p1) :: (load (<3 x s32>) from `<3 x i32> addrspace(1)* undef`, align 16, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<3 x s32>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[DEF]](p1) :: (load (<3 x s32>) from `<3 x i32> addrspace(1)* undef`, align 16, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<3 x s32>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %val = load <3 x i32>, <3 x i32> addrspace(1)* undef
   ret <3 x i32> %val
 }
@@ -407,14 +500,17 @@ define <3 x i32> @v3i32_func_void() #0 {
 define <4 x i32> @v4i32_func_void() #0 {
   ; CHECK-LABEL: name: v4i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[DEF]](p1) :: (load (<4 x s32>) from `<4 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[DEF]](p1) :: (load (<4 x s32>) from `<4 x i32> addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
   ret <4 x i32> %val
 }
@@ -422,15 +518,18 @@ define <4 x i32> @v4i32_func_void() #0 {
 define <5 x i32> @v5i32_func_void() #0 {
   ; CHECK-LABEL: name: v5i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<5 x s32>) = G_LOAD [[DEF]](p1) :: (volatile load (<5 x s32>) from `<5 x i32> addrspace(1)* undef`, align 32, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<5 x s32>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<5 x s32>) = G_LOAD [[DEF]](p1) :: (volatile load (<5 x s32>) from `<5 x i32> addrspace(1)* undef`, align 32, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<5 x s32>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4
   %val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef
   ret <5 x i32> %val
 }
@@ -438,19 +537,22 @@ define <5 x i32> @v5i32_func_void() #0 {
 define <8 x i32> @v8i32_func_void() #0 {
   ; CHECK-LABEL: name: v8i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<8 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[LOAD]](p1) :: (load (<8 x s32>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<8 x s32>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](s32)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<8 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[LOAD]](p1) :: (load (<8 x s32>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<8 x s32>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
   %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
   ret <8 x i32> %val
@@ -459,27 +561,30 @@ define <8 x i32> @v8i32_func_void() #0 {
 define <16 x i32> @v16i32_func_void() #0 {
   ; CHECK-LABEL: name: v16i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<16 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[LOAD]](p1) :: (load (<16 x s32>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](s32)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](s32)
-  ; CHECK-NEXT:   $vgpr8 = COPY [[UV8]](s32)
-  ; CHECK-NEXT:   $vgpr9 = COPY [[UV9]](s32)
-  ; CHECK-NEXT:   $vgpr10 = COPY [[UV10]](s32)
-  ; CHECK-NEXT:   $vgpr11 = COPY [[UV11]](s32)
-  ; CHECK-NEXT:   $vgpr12 = COPY [[UV12]](s32)
-  ; CHECK-NEXT:   $vgpr13 = COPY [[UV13]](s32)
-  ; CHECK-NEXT:   $vgpr14 = COPY [[UV14]](s32)
-  ; CHECK-NEXT:   $vgpr15 = COPY [[UV15]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<16 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[LOAD]](p1) :: (load (<16 x s32>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
+  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
+  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
+  ; CHECK:   $vgpr10 = COPY [[UV10]](s32)
+  ; CHECK:   $vgpr11 = COPY [[UV11]](s32)
+  ; CHECK:   $vgpr12 = COPY [[UV12]](s32)
+  ; CHECK:   $vgpr13 = COPY [[UV13]](s32)
+  ; CHECK:   $vgpr14 = COPY [[UV14]](s32)
+  ; CHECK:   $vgpr15 = COPY [[UV15]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
   %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
   ret <16 x i32> %val
@@ -488,43 +593,46 @@ define <16 x i32> @v16i32_func_void() #0 {
 define <32 x i32> @v32i32_func_void() #0 {
   ; CHECK-LABEL: name: v32i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<32 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: (load (<32 x s32>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](s32)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](s32)
-  ; CHECK-NEXT:   $vgpr8 = COPY [[UV8]](s32)
-  ; CHECK-NEXT:   $vgpr9 = COPY [[UV9]](s32)
-  ; CHECK-NEXT:   $vgpr10 = COPY [[UV10]](s32)
-  ; CHECK-NEXT:   $vgpr11 = COPY [[UV11]](s32)
-  ; CHECK-NEXT:   $vgpr12 = COPY [[UV12]](s32)
-  ; CHECK-NEXT:   $vgpr13 = COPY [[UV13]](s32)
-  ; CHECK-NEXT:   $vgpr14 = COPY [[UV14]](s32)
-  ; CHECK-NEXT:   $vgpr15 = COPY [[UV15]](s32)
-  ; CHECK-NEXT:   $vgpr16 = COPY [[UV16]](s32)
-  ; CHECK-NEXT:   $vgpr17 = COPY [[UV17]](s32)
-  ; CHECK-NEXT:   $vgpr18 = COPY [[UV18]](s32)
-  ; CHECK-NEXT:   $vgpr19 = COPY [[UV19]](s32)
-  ; CHECK-NEXT:   $vgpr20 = COPY [[UV20]](s32)
-  ; CHECK-NEXT:   $vgpr21 = COPY [[UV21]](s32)
-  ; CHECK-NEXT:   $vgpr22 = COPY [[UV22]](s32)
-  ; CHECK-NEXT:   $vgpr23 = COPY [[UV23]](s32)
-  ; CHECK-NEXT:   $vgpr24 = COPY [[UV24]](s32)
-  ; CHECK-NEXT:   $vgpr25 = COPY [[UV25]](s32)
-  ; CHECK-NEXT:   $vgpr26 = COPY [[UV26]](s32)
-  ; CHECK-NEXT:   $vgpr27 = COPY [[UV27]](s32)
-  ; CHECK-NEXT:   $vgpr28 = COPY [[UV28]](s32)
-  ; CHECK-NEXT:   $vgpr29 = COPY [[UV29]](s32)
-  ; CHECK-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[UV31]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<32 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: (load (<32 x s32>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
+  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
+  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
+  ; CHECK:   $vgpr10 = COPY [[UV10]](s32)
+  ; CHECK:   $vgpr11 = COPY [[UV11]](s32)
+  ; CHECK:   $vgpr12 = COPY [[UV12]](s32)
+  ; CHECK:   $vgpr13 = COPY [[UV13]](s32)
+  ; CHECK:   $vgpr14 = COPY [[UV14]](s32)
+  ; CHECK:   $vgpr15 = COPY [[UV15]](s32)
+  ; CHECK:   $vgpr16 = COPY [[UV16]](s32)
+  ; CHECK:   $vgpr17 = COPY [[UV17]](s32)
+  ; CHECK:   $vgpr18 = COPY [[UV18]](s32)
+  ; CHECK:   $vgpr19 = COPY [[UV19]](s32)
+  ; CHECK:   $vgpr20 = COPY [[UV20]](s32)
+  ; CHECK:   $vgpr21 = COPY [[UV21]](s32)
+  ; CHECK:   $vgpr22 = COPY [[UV22]](s32)
+  ; CHECK:   $vgpr23 = COPY [[UV23]](s32)
+  ; CHECK:   $vgpr24 = COPY [[UV24]](s32)
+  ; CHECK:   $vgpr25 = COPY [[UV25]](s32)
+  ; CHECK:   $vgpr26 = COPY [[UV26]](s32)
+  ; CHECK:   $vgpr27 = COPY [[UV27]](s32)
+  ; CHECK:   $vgpr28 = COPY [[UV28]](s32)
+  ; CHECK:   $vgpr29 = COPY [[UV29]](s32)
+  ; CHECK:   $vgpr30 = COPY [[UV30]](s32)
+  ; CHECK:   $vgpr31 = COPY [[UV31]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
   %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
   ret <32 x i32> %val
@@ -533,14 +641,17 @@ define <32 x i32> @v32i32_func_void() #0 {
 define <2 x i64> @v2i64_func_void() #0 {
   ; CHECK-LABEL: name: v2i64_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[DEF]](p1) :: (load (<2 x s64>) from `<2 x i64> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[DEF]](p1) :: (load (<2 x s64>) from `<2 x i64> addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %val = load <2 x i64>, <2 x i64> addrspace(1)* undef
   ret <2 x i64> %val
 }
@@ -548,17 +659,20 @@ define <2 x i64> @v2i64_func_void() #0 {
 define <3 x i64> @v3i64_func_void() #0 {
   ; CHECK-LABEL: name: v3i64_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<3 x i64> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<3 x s64>) = G_LOAD [[LOAD]](p1) :: (load (<3 x s64>) from %ir.ptr, align 32, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<3 x s64>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<3 x i64> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<3 x s64>) = G_LOAD [[LOAD]](p1) :: (load (<3 x s64>) from %ir.ptr, align 32, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<3 x s64>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
   %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(4)* undef
   %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr
   ret <3 x i64> %val
@@ -567,19 +681,22 @@ define <3 x i64> @v3i64_func_void() #0 {
 define <4 x i64> @v4i64_func_void() #0 {
   ; CHECK-LABEL: name: v4i64_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<4 x i64> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[LOAD]](p1) :: (load (<4 x s64>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<4 x s64>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](s32)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<4 x i64> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[LOAD]](p1) :: (load (<4 x s64>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<4 x s64>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
   %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(4)* undef
   %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr
   ret <4 x i64> %val
@@ -588,21 +705,24 @@ define <4 x i64> @v4i64_func_void() #0 {
 define <5 x i64> @v5i64_func_void() #0 {
   ; CHECK-LABEL: name: v5i64_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<5 x i64> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<5 x s64>) = G_LOAD [[LOAD]](p1) :: (load (<5 x s64>) from %ir.ptr, align 64, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<5 x s64>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](s32)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](s32)
-  ; CHECK-NEXT:   $vgpr8 = COPY [[UV8]](s32)
-  ; CHECK-NEXT:   $vgpr9 = COPY [[UV9]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<5 x i64> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<5 x s64>) = G_LOAD [[LOAD]](p1) :: (load (<5 x s64>) from %ir.ptr, align 64, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<5 x s64>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
+  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
+  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9
   %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(4)* undef
   %val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr
   ret <5 x i64> %val
@@ -611,27 +731,30 @@ define <5 x i64> @v5i64_func_void() #0 {
 define <8 x i64> @v8i64_func_void() #0 {
   ; CHECK-LABEL: name: v8i64_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<8 x i64> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<8 x s64>) = G_LOAD [[LOAD]](p1) :: (load (<8 x s64>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<8 x s64>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](s32)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](s32)
-  ; CHECK-NEXT:   $vgpr8 = COPY [[UV8]](s32)
-  ; CHECK-NEXT:   $vgpr9 = COPY [[UV9]](s32)
-  ; CHECK-NEXT:   $vgpr10 = COPY [[UV10]](s32)
-  ; CHECK-NEXT:   $vgpr11 = COPY [[UV11]](s32)
-  ; CHECK-NEXT:   $vgpr12 = COPY [[UV12]](s32)
-  ; CHECK-NEXT:   $vgpr13 = COPY [[UV13]](s32)
-  ; CHECK-NEXT:   $vgpr14 = COPY [[UV14]](s32)
-  ; CHECK-NEXT:   $vgpr15 = COPY [[UV15]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<8 x i64> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<8 x s64>) = G_LOAD [[LOAD]](p1) :: (load (<8 x s64>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<8 x s64>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
+  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
+  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
+  ; CHECK:   $vgpr10 = COPY [[UV10]](s32)
+  ; CHECK:   $vgpr11 = COPY [[UV11]](s32)
+  ; CHECK:   $vgpr12 = COPY [[UV12]](s32)
+  ; CHECK:   $vgpr13 = COPY [[UV13]](s32)
+  ; CHECK:   $vgpr14 = COPY [[UV14]](s32)
+  ; CHECK:   $vgpr15 = COPY [[UV15]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
   %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(4)* undef
   %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr
   ret <8 x i64> %val
@@ -640,43 +763,46 @@ define <8 x i64> @v8i64_func_void() #0 {
 define <16 x i64> @v16i64_func_void() #0 {
   ; CHECK-LABEL: name: v16i64_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<16 x i64> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s64>) = G_LOAD [[LOAD]](p1) :: (load (<16 x s64>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s64>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](s32)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](s32)
-  ; CHECK-NEXT:   $vgpr8 = COPY [[UV8]](s32)
-  ; CHECK-NEXT:   $vgpr9 = COPY [[UV9]](s32)
-  ; CHECK-NEXT:   $vgpr10 = COPY [[UV10]](s32)
-  ; CHECK-NEXT:   $vgpr11 = COPY [[UV11]](s32)
-  ; CHECK-NEXT:   $vgpr12 = COPY [[UV12]](s32)
-  ; CHECK-NEXT:   $vgpr13 = COPY [[UV13]](s32)
-  ; CHECK-NEXT:   $vgpr14 = COPY [[UV14]](s32)
-  ; CHECK-NEXT:   $vgpr15 = COPY [[UV15]](s32)
-  ; CHECK-NEXT:   $vgpr16 = COPY [[UV16]](s32)
-  ; CHECK-NEXT:   $vgpr17 = COPY [[UV17]](s32)
-  ; CHECK-NEXT:   $vgpr18 = COPY [[UV18]](s32)
-  ; CHECK-NEXT:   $vgpr19 = COPY [[UV19]](s32)
-  ; CHECK-NEXT:   $vgpr20 = COPY [[UV20]](s32)
-  ; CHECK-NEXT:   $vgpr21 = COPY [[UV21]](s32)
-  ; CHECK-NEXT:   $vgpr22 = COPY [[UV22]](s32)
-  ; CHECK-NEXT:   $vgpr23 = COPY [[UV23]](s32)
-  ; CHECK-NEXT:   $vgpr24 = COPY [[UV24]](s32)
-  ; CHECK-NEXT:   $vgpr25 = COPY [[UV25]](s32)
-  ; CHECK-NEXT:   $vgpr26 = COPY [[UV26]](s32)
-  ; CHECK-NEXT:   $vgpr27 = COPY [[UV27]](s32)
-  ; CHECK-NEXT:   $vgpr28 = COPY [[UV28]](s32)
-  ; CHECK-NEXT:   $vgpr29 = COPY [[UV29]](s32)
-  ; CHECK-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[UV31]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<16 x i64> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<16 x s64>) = G_LOAD [[LOAD]](p1) :: (load (<16 x s64>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s64>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
+  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
+  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
+  ; CHECK:   $vgpr10 = COPY [[UV10]](s32)
+  ; CHECK:   $vgpr11 = COPY [[UV11]](s32)
+  ; CHECK:   $vgpr12 = COPY [[UV12]](s32)
+  ; CHECK:   $vgpr13 = COPY [[UV13]](s32)
+  ; CHECK:   $vgpr14 = COPY [[UV14]](s32)
+  ; CHECK:   $vgpr15 = COPY [[UV15]](s32)
+  ; CHECK:   $vgpr16 = COPY [[UV16]](s32)
+  ; CHECK:   $vgpr17 = COPY [[UV17]](s32)
+  ; CHECK:   $vgpr18 = COPY [[UV18]](s32)
+  ; CHECK:   $vgpr19 = COPY [[UV19]](s32)
+  ; CHECK:   $vgpr20 = COPY [[UV20]](s32)
+  ; CHECK:   $vgpr21 = COPY [[UV21]](s32)
+  ; CHECK:   $vgpr22 = COPY [[UV22]](s32)
+  ; CHECK:   $vgpr23 = COPY [[UV23]](s32)
+  ; CHECK:   $vgpr24 = COPY [[UV24]](s32)
+  ; CHECK:   $vgpr25 = COPY [[UV25]](s32)
+  ; CHECK:   $vgpr26 = COPY [[UV26]](s32)
+  ; CHECK:   $vgpr27 = COPY [[UV27]](s32)
+  ; CHECK:   $vgpr28 = COPY [[UV28]](s32)
+  ; CHECK:   $vgpr29 = COPY [[UV29]](s32)
+  ; CHECK:   $vgpr30 = COPY [[UV30]](s32)
+  ; CHECK:   $vgpr31 = COPY [[UV31]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
   %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(4)* undef
   %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr
   ret <16 x i64> %val
@@ -685,10 +811,13 @@ define <16 x i64> @v16i64_func_void() #0 {
 define <2 x i16> @v2i16_func_void() #0 {
   ; CHECK-LABEL: name: v2i16_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: (load (<2 x s16>) from `<2 x i16> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[LOAD]](<2 x s16>)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: (load (<2 x s16>) from `<2 x i16> addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   $vgpr0 = COPY [[LOAD]](<2 x s16>)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load <2 x i16>, <2 x i16> addrspace(1)* undef
   ret <2 x i16> %val
 }
@@ -696,10 +825,13 @@ define <2 x i16> @v2i16_func_void() #0 {
 define <2 x half> @v2f16_func_void() #0 {
   ; CHECK-LABEL: name: v2f16_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: (load (<2 x s16>) from `<2 x half> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[LOAD]](<2 x s16>)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: (load (<2 x s16>) from `<2 x half> addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   $vgpr0 = COPY [[LOAD]](<2 x s16>)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load <2 x half>, <2 x half> addrspace(1)* undef
   ret <2 x half> %val
 }
@@ -707,14 +839,17 @@ define <2 x half> @v2f16_func_void() #0 {
 define <3 x i16> @v3i16_func_void() #0 {
   ; CHECK-LABEL: name: v3i16_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: (load (<3 x s16>) from `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[LOAD]](<3 x s16>), [[DEF1]](<3 x s16>)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](<2 x s16>)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: (load (<3 x s16>) from `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[LOAD]](<3 x s16>), [[DEF1]](<3 x s16>)
+  ; CHECK:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](<2 x s16>)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](<2 x s16>)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load <3 x i16>, <3 x i16> addrspace(1)* undef
   ret <3 x i16> %val
 }
@@ -722,12 +857,15 @@ define <3 x i16> @v3i16_func_void() #0 {
 define <4 x i16> @v4i16_func_void() #0 {
   ; CHECK-LABEL: name: v4i16_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[DEF]](p1) :: (load (<4 x s16>) from `<4 x i16> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](<2 x s16>)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[DEF]](p1) :: (load (<4 x s16>) from `<4 x i16> addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](<2 x s16>)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](<2 x s16>)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load <4 x i16>, <4 x i16> addrspace(1)* undef
   ret <4 x i16> %val
 }
@@ -735,12 +873,15 @@ define <4 x i16> @v4i16_func_void() #0 {
 define <4 x half> @v4f16_func_void() #0 {
   ; CHECK-LABEL: name: v4f16_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[DEF]](p1) :: (load (<4 x s16>) from `<4 x half> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](<2 x s16>)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[DEF]](p1) :: (load (<4 x s16>) from `<4 x half> addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](<2 x s16>)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](<2 x s16>)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load <4 x half>, <4 x half> addrspace(1)* undef
   ret <4 x half> %val
 }
@@ -748,16 +889,19 @@ define <4 x half> @v4f16_func_void() #0 {
 define <5 x i16> @v5i16_func_void() #0 {
   ; CHECK-LABEL: name: v5i16_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<5 x i16> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<5 x s16>) = G_LOAD [[LOAD]](p1) :: (load (<5 x s16>) from %ir.ptr, align 16, addrspace 1)
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(<5 x s16>) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[LOAD1]](<5 x s16>), [[DEF1]](<5 x s16>)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<10 x s16>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](<2 x s16>)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<5 x i16> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<5 x s16>) = G_LOAD [[LOAD]](p1) :: (load (<5 x s16>) from %ir.ptr, align 16, addrspace 1)
+  ; CHECK:   [[DEF1:%[0-9]+]]:_(<5 x s16>) = G_IMPLICIT_DEF
+  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[LOAD1]](<5 x s16>), [[DEF1]](<5 x s16>)
+  ; CHECK:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<10 x s16>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](<2 x s16>)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](<2 x s16>)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](<2 x s16>)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
   %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
   ret <5 x i16> %val
@@ -766,15 +910,18 @@ define <5 x i16> @v5i16_func_void() #0 {
 define <8 x i16> @v8i16_func_void() #0 {
   ; CHECK-LABEL: name: v8i16_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<8 x i16> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[LOAD]](p1) :: (load (<8 x s16>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD1]](<8 x s16>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](<2 x s16>)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<8 x i16> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[LOAD]](p1) :: (load (<8 x s16>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD1]](<8 x s16>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](<2 x s16>)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](<2 x s16>)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](<2 x s16>)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](<2 x s16>)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(4)* undef
   %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
   ret <8 x i16> %val
@@ -783,19 +930,22 @@ define <8 x i16> @v8i16_func_void() #0 {
 define <16 x i16> @v16i16_func_void() #0 {
   ; CHECK-LABEL: name: v16i16_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<16 x i16> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[LOAD]](p1) :: (load (<16 x s16>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD1]](<16 x s16>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](<2 x s16>)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](<2 x s16>)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<16 x i16> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[LOAD]](p1) :: (load (<16 x s16>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD1]](<16 x s16>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](<2 x s16>)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](<2 x s16>)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](<2 x s16>)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](<2 x s16>)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](<2 x s16>)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](<2 x s16>)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](<2 x s16>)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](<2 x s16>)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
   %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(4)* undef
   %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
   ret <16 x i16> %val
@@ -804,59 +954,62 @@ define <16 x i16> @v16i16_func_void() #0 {
 define <16 x i8> @v16i8_func_void() #0 {
   ; CHECK-LABEL: name: v16i8_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<16 x i8> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[LOAD]](p1) :: (load (<16 x s8>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8), [[UV12:%[0-9]+]]:_(s8), [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<16 x s8>)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8)
-  ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV1]](s8)
-  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s16) = G_ANYEXT [[UV2]](s8)
-  ; CHECK-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s16) = G_ANYEXT [[UV3]](s8)
-  ; CHECK-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s16) = G_ANYEXT [[UV4]](s8)
-  ; CHECK-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s16) = G_ANYEXT [[UV5]](s8)
-  ; CHECK-NEXT:   [[ANYEXT6:%[0-9]+]]:_(s16) = G_ANYEXT [[UV6]](s8)
-  ; CHECK-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s16) = G_ANYEXT [[UV7]](s8)
-  ; CHECK-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s16) = G_ANYEXT [[UV8]](s8)
-  ; CHECK-NEXT:   [[ANYEXT9:%[0-9]+]]:_(s16) = G_ANYEXT [[UV9]](s8)
-  ; CHECK-NEXT:   [[ANYEXT10:%[0-9]+]]:_(s16) = G_ANYEXT [[UV10]](s8)
-  ; CHECK-NEXT:   [[ANYEXT11:%[0-9]+]]:_(s16) = G_ANYEXT [[UV11]](s8)
-  ; CHECK-NEXT:   [[ANYEXT12:%[0-9]+]]:_(s16) = G_ANYEXT [[UV12]](s8)
-  ; CHECK-NEXT:   [[ANYEXT13:%[0-9]+]]:_(s16) = G_ANYEXT [[UV13]](s8)
-  ; CHECK-NEXT:   [[ANYEXT14:%[0-9]+]]:_(s16) = G_ANYEXT [[UV14]](s8)
-  ; CHECK-NEXT:   [[ANYEXT15:%[0-9]+]]:_(s16) = G_ANYEXT [[UV15]](s8)
-  ; CHECK-NEXT:   [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT16]](s32)
-  ; CHECK-NEXT:   [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT1]](s16)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[ANYEXT17]](s32)
-  ; CHECK-NEXT:   [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT2]](s16)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[ANYEXT18]](s32)
-  ; CHECK-NEXT:   [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT3]](s16)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[ANYEXT19]](s32)
-  ; CHECK-NEXT:   [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT4]](s16)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[ANYEXT20]](s32)
-  ; CHECK-NEXT:   [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT5]](s16)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[ANYEXT21]](s32)
-  ; CHECK-NEXT:   [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT6]](s16)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[ANYEXT22]](s32)
-  ; CHECK-NEXT:   [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT7]](s16)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[ANYEXT23]](s32)
-  ; CHECK-NEXT:   [[ANYEXT24:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT8]](s16)
-  ; CHECK-NEXT:   $vgpr8 = COPY [[ANYEXT24]](s32)
-  ; CHECK-NEXT:   [[ANYEXT25:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT9]](s16)
-  ; CHECK-NEXT:   $vgpr9 = COPY [[ANYEXT25]](s32)
-  ; CHECK-NEXT:   [[ANYEXT26:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT10]](s16)
-  ; CHECK-NEXT:   $vgpr10 = COPY [[ANYEXT26]](s32)
-  ; CHECK-NEXT:   [[ANYEXT27:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT11]](s16)
-  ; CHECK-NEXT:   $vgpr11 = COPY [[ANYEXT27]](s32)
-  ; CHECK-NEXT:   [[ANYEXT28:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT12]](s16)
-  ; CHECK-NEXT:   $vgpr12 = COPY [[ANYEXT28]](s32)
-  ; CHECK-NEXT:   [[ANYEXT29:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT13]](s16)
-  ; CHECK-NEXT:   $vgpr13 = COPY [[ANYEXT29]](s32)
-  ; CHECK-NEXT:   [[ANYEXT30:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT14]](s16)
-  ; CHECK-NEXT:   $vgpr14 = COPY [[ANYEXT30]](s32)
-  ; CHECK-NEXT:   [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT15]](s16)
-  ; CHECK-NEXT:   $vgpr15 = COPY [[ANYEXT31]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<16 x i8> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[LOAD]](p1) :: (load (<16 x s8>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8), [[UV12:%[0-9]+]]:_(s8), [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<16 x s8>)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8)
+  ; CHECK:   [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV1]](s8)
+  ; CHECK:   [[ANYEXT2:%[0-9]+]]:_(s16) = G_ANYEXT [[UV2]](s8)
+  ; CHECK:   [[ANYEXT3:%[0-9]+]]:_(s16) = G_ANYEXT [[UV3]](s8)
+  ; CHECK:   [[ANYEXT4:%[0-9]+]]:_(s16) = G_ANYEXT [[UV4]](s8)
+  ; CHECK:   [[ANYEXT5:%[0-9]+]]:_(s16) = G_ANYEXT [[UV5]](s8)
+  ; CHECK:   [[ANYEXT6:%[0-9]+]]:_(s16) = G_ANYEXT [[UV6]](s8)
+  ; CHECK:   [[ANYEXT7:%[0-9]+]]:_(s16) = G_ANYEXT [[UV7]](s8)
+  ; CHECK:   [[ANYEXT8:%[0-9]+]]:_(s16) = G_ANYEXT [[UV8]](s8)
+  ; CHECK:   [[ANYEXT9:%[0-9]+]]:_(s16) = G_ANYEXT [[UV9]](s8)
+  ; CHECK:   [[ANYEXT10:%[0-9]+]]:_(s16) = G_ANYEXT [[UV10]](s8)
+  ; CHECK:   [[ANYEXT11:%[0-9]+]]:_(s16) = G_ANYEXT [[UV11]](s8)
+  ; CHECK:   [[ANYEXT12:%[0-9]+]]:_(s16) = G_ANYEXT [[UV12]](s8)
+  ; CHECK:   [[ANYEXT13:%[0-9]+]]:_(s16) = G_ANYEXT [[UV13]](s8)
+  ; CHECK:   [[ANYEXT14:%[0-9]+]]:_(s16) = G_ANYEXT [[UV14]](s8)
+  ; CHECK:   [[ANYEXT15:%[0-9]+]]:_(s16) = G_ANYEXT [[UV15]](s8)
+  ; CHECK:   [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT16]](s32)
+  ; CHECK:   [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT1]](s16)
+  ; CHECK:   $vgpr1 = COPY [[ANYEXT17]](s32)
+  ; CHECK:   [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT2]](s16)
+  ; CHECK:   $vgpr2 = COPY [[ANYEXT18]](s32)
+  ; CHECK:   [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT3]](s16)
+  ; CHECK:   $vgpr3 = COPY [[ANYEXT19]](s32)
+  ; CHECK:   [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT4]](s16)
+  ; CHECK:   $vgpr4 = COPY [[ANYEXT20]](s32)
+  ; CHECK:   [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT5]](s16)
+  ; CHECK:   $vgpr5 = COPY [[ANYEXT21]](s32)
+  ; CHECK:   [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT6]](s16)
+  ; CHECK:   $vgpr6 = COPY [[ANYEXT22]](s32)
+  ; CHECK:   [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT7]](s16)
+  ; CHECK:   $vgpr7 = COPY [[ANYEXT23]](s32)
+  ; CHECK:   [[ANYEXT24:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT8]](s16)
+  ; CHECK:   $vgpr8 = COPY [[ANYEXT24]](s32)
+  ; CHECK:   [[ANYEXT25:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT9]](s16)
+  ; CHECK:   $vgpr9 = COPY [[ANYEXT25]](s32)
+  ; CHECK:   [[ANYEXT26:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT10]](s16)
+  ; CHECK:   $vgpr10 = COPY [[ANYEXT26]](s32)
+  ; CHECK:   [[ANYEXT27:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT11]](s16)
+  ; CHECK:   $vgpr11 = COPY [[ANYEXT27]](s32)
+  ; CHECK:   [[ANYEXT28:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT12]](s16)
+  ; CHECK:   $vgpr12 = COPY [[ANYEXT28]](s32)
+  ; CHECK:   [[ANYEXT29:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT13]](s16)
+  ; CHECK:   $vgpr13 = COPY [[ANYEXT29]](s32)
+  ; CHECK:   [[ANYEXT30:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT14]](s16)
+  ; CHECK:   $vgpr14 = COPY [[ANYEXT30]](s32)
+  ; CHECK:   [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT15]](s16)
+  ; CHECK:   $vgpr15 = COPY [[ANYEXT31]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
   %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef
   %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
   ret <16 x i8> %val
@@ -865,16 +1018,19 @@ define <16 x i8> @v16i8_func_void() #0 {
 define <2 x i8> @v2i8_func_void() #0 {
   ; CHECK-LABEL: name: v2i8_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[DEF]](p1) :: (load (<2 x s8>) from `<2 x i8> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD]](<2 x s8>)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8)
-  ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV1]](s8)
-  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT2]](s32)
-  ; CHECK-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT1]](s16)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[ANYEXT3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[DEF]](p1) :: (load (<2 x s8>) from `<2 x i8> addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD]](<2 x s8>)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8)
+  ; CHECK:   [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV1]](s8)
+  ; CHECK:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT2]](s32)
+  ; CHECK:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT1]](s16)
+  ; CHECK:   $vgpr1 = COPY [[ANYEXT3]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load <2 x i8>, <2 x i8> addrspace(1)* undef
   ret <2 x i8> %val
 }
@@ -882,19 +1038,22 @@ define <2 x i8> @v2i8_func_void() #0 {
 define <3 x i8> @v3i8_func_void() #0 {
   ; CHECK-LABEL: name: v3i8_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[DEF]](p1) :: (load (<3 x s8>) from `<3 x i8> addrspace(1)* undef`, align 4, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD]](<3 x s8>)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8)
-  ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV1]](s8)
-  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s16) = G_ANYEXT [[UV2]](s8)
-  ; CHECK-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT3]](s32)
-  ; CHECK-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT1]](s16)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[ANYEXT4]](s32)
-  ; CHECK-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT2]](s16)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[ANYEXT5]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[DEF]](p1) :: (load (<3 x s8>) from `<3 x i8> addrspace(1)* undef`, align 4, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD]](<3 x s8>)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8)
+  ; CHECK:   [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV1]](s8)
+  ; CHECK:   [[ANYEXT2:%[0-9]+]]:_(s16) = G_ANYEXT [[UV2]](s8)
+  ; CHECK:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT3]](s32)
+  ; CHECK:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT1]](s16)
+  ; CHECK:   $vgpr1 = COPY [[ANYEXT4]](s32)
+  ; CHECK:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT2]](s16)
+  ; CHECK:   $vgpr2 = COPY [[ANYEXT5]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
   %val = load <3 x i8>, <3 x i8> addrspace(1)* undef
   ret <3 x i8> %val
 }
@@ -902,23 +1061,26 @@ define <3 x i8> @v3i8_func_void() #0 {
 define <4  x i8> @v4i8_func_void() #0 {
   ; CHECK-LABEL: name: v4i8_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<4 x i8> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[LOAD]](p1) :: (load (<4 x s8>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<4 x s8>)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8)
-  ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV1]](s8)
-  ; CHECK-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s16) = G_ANYEXT [[UV2]](s8)
-  ; CHECK-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s16) = G_ANYEXT [[UV3]](s8)
-  ; CHECK-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT4]](s32)
-  ; CHECK-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT1]](s16)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[ANYEXT5]](s32)
-  ; CHECK-NEXT:   [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT2]](s16)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[ANYEXT6]](s32)
-  ; CHECK-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT3]](s16)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[ANYEXT7]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<4 x i8> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[LOAD]](p1) :: (load (<4 x s8>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<4 x s8>)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8)
+  ; CHECK:   [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV1]](s8)
+  ; CHECK:   [[ANYEXT2:%[0-9]+]]:_(s16) = G_ANYEXT [[UV2]](s8)
+  ; CHECK:   [[ANYEXT3:%[0-9]+]]:_(s16) = G_ANYEXT [[UV3]](s8)
+  ; CHECK:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT4]](s32)
+  ; CHECK:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT1]](s16)
+  ; CHECK:   $vgpr1 = COPY [[ANYEXT5]](s32)
+  ; CHECK:   [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT2]](s16)
+  ; CHECK:   $vgpr2 = COPY [[ANYEXT6]](s32)
+  ; CHECK:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT3]](s16)
+  ; CHECK:   $vgpr3 = COPY [[ANYEXT7]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %ptr = load volatile <4  x i8> addrspace(1)*, <4  x i8> addrspace(1)* addrspace(4)* undef
   %val = load <4  x i8>, <4  x i8> addrspace(1)* %ptr
   ret <4  x i8> %val
@@ -927,15 +1089,18 @@ define <4  x i8> @v4i8_func_void() #0 {
 define {i8, i32} @struct_i8_i32_func_void() #0 {
   ; CHECK-LABEL: name: struct_i8_i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load (s8) from `{ i8, i32 } addrspace(1)* undef`, align 4, addrspace 1)
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C]](s64)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32) from `{ i8, i32 } addrspace(1)* undef` + 4, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s8)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[LOAD1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load (s8) from `{ i8, i32 } addrspace(1)* undef`, align 4, addrspace 1)
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C]](s64)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32) from `{ i8, i32 } addrspace(1)* undef` + 4, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s8)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   $vgpr1 = COPY [[LOAD1]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef
   ret { i8, i32 } %val
 }
@@ -943,18 +1108,19 @@ define {i8, i32} @struct_i8_i32_func_void() #0 {
 define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret({ i8, i32 }) %arg0) #0 {
   ; CHECK-LABEL: name: void_func_sret_struct_i8_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p1) :: (volatile load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-  ; CHECK-NEXT:   G_STORE [[LOAD]](s8), [[COPY]](p5) :: (store (s8) into %ir.gep01, addrspace 5)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](s32), [[PTR_ADD]](p5) :: (store (s32) into %ir.gep1, addrspace 5)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY2]](p1) :: (volatile load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+  ; CHECK:   G_STORE [[LOAD]](s8), [[COPY]](p5) :: (store (s8) into %ir.gep01, addrspace 5)
+  ; CHECK:   G_STORE [[LOAD1]](s32), [[PTR_ADD]](p5) :: (store (s32) into %ir.gep1, addrspace 5)
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]]
   %val0 = load volatile i8, i8 addrspace(1)* undef
   %val1 = load volatile i32, i32 addrspace(1)* undef
   %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0
@@ -971,14 +1137,15 @@ define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret({ i8, i
 define <33 x i32> @v33i32_func_void() #0 {
   ; CHECK-LABEL: name: v33i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<33 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<33 x s32>) = G_LOAD [[LOAD]](p1) :: (load (<33 x s32>) from %ir.ptr, align 256, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](<33 x s32>), [[COPY]](p5) :: (store (<33 x s32>), align 256, addrspace 5)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<33 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<33 x s32>) = G_LOAD [[LOAD]](p1) :: (load (<33 x s32>) from %ir.ptr, align 256, addrspace 1)
+  ; CHECK:   G_STORE [[LOAD1]](<33 x s32>), [[COPY]](p5) :: (store (<33 x s32>), align 256, addrspace 5)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]]
   %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr
   ret <33 x i32> %val
@@ -987,21 +1154,22 @@ define <33 x i32> @v33i32_func_void() #0 {
 define <33 x i32> @v33i32_func_v33i32_i32(<33 x i32> addrspace(1)* %p, i32 %idx) #0 {
   ; CHECK-LABEL: name: v33i32_func_v33i32_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY3]](s32)
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
-  ; CHECK-NEXT:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[MUL]](s64)
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(p1) = COPY [[PTR_ADD]](p1)
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<33 x s32>) = G_LOAD [[COPY4]](p1) :: (load (<33 x s32>) from %ir.gep, align 256, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD]](<33 x s32>), [[COPY]](p5) :: (store (<33 x s32>), align 256, addrspace 5)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY3]](s32)
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
+  ; CHECK:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[MUL]](s64)
+  ; CHECK:   [[COPY5:%[0-9]+]]:_(p1) = COPY [[PTR_ADD]](p1)
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<33 x s32>) = G_LOAD [[COPY5]](p1) :: (load (<33 x s32>) from %ir.gep, align 256, addrspace 1)
+  ; CHECK:   G_STORE [[LOAD]](<33 x s32>), [[COPY]](p5) :: (store (<33 x s32>), align 256, addrspace 5)
+  ; CHECK:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK:   S_SETPC_B64_return [[COPY6]]
   %gep = getelementptr inbounds <33 x i32>, <33 x i32> addrspace(1)* %p, i32 %idx
   %val = load <33 x i32>, <33 x i32> addrspace(1)* %gep
   ret <33 x i32> %val
@@ -1010,20 +1178,21 @@ define <33 x i32> @v33i32_func_v33i32_i32(<33 x i32> addrspace(1)* %p, i32 %idx)
 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
   ; CHECK-LABEL: name: struct_v32i32_i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `{ <32 x i32>, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: (load (<32 x s32>) from %ir.ptr, addrspace 1)
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
-  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32) from %ir.ptr + 128, align 128, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](<32 x s32>), [[COPY]](p5) :: (store (<32 x s32>), addrspace 5)
-  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
-  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-  ; CHECK-NEXT:   G_STORE [[LOAD2]](s32), [[PTR_ADD1]](p5) :: (store (s32), align 128, addrspace 5)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `{ <32 x i32>, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: (load (<32 x s32>) from %ir.ptr, addrspace 1)
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
+  ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32) from %ir.ptr + 128, align 128, addrspace 1)
+  ; CHECK:   G_STORE [[LOAD1]](<32 x s32>), [[COPY]](p5) :: (store (<32 x s32>), addrspace 5)
+  ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
+  ; CHECK:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+  ; CHECK:   G_STORE [[LOAD2]](s32), [[PTR_ADD1]](p5) :: (store (s32), align 128, addrspace 5)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]]
   %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr
   ret { <32 x i32>, i32 }%val
@@ -1032,20 +1201,21 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
   ; CHECK-LABEL: name: struct_i32_v32i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `{ i32, <32 x i32> } addrspace(1)* addrspace(4)* undef`, addrspace 4)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[LOAD]](p1) :: (load (s32) from %ir.ptr, align 128, addrspace 1)
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
-  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<32 x s32>) from %ir.ptr + 128, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](s32), [[COPY]](p5) :: (store (s32), align 128, addrspace 5)
-  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
-  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-  ; CHECK-NEXT:   G_STORE [[LOAD2]](<32 x s32>), [[PTR_ADD1]](p5) :: (store (<32 x s32>), addrspace 5)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `{ i32, <32 x i32> } addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[LOAD]](p1) :: (load (s32) from %ir.ptr, align 128, addrspace 1)
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
+  ; CHECK:   [[LOAD2:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<32 x s32>) from %ir.ptr + 128, addrspace 1)
+  ; CHECK:   G_STORE [[LOAD1]](s32), [[COPY]](p5) :: (store (s32), align 128, addrspace 5)
+  ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
+  ; CHECK:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+  ; CHECK:   G_STORE [[LOAD2]](<32 x s32>), [[PTR_ADD1]](p5) :: (store (<32 x s32>), addrspace 5)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]]
   %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(4)* undef
   %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr
   ret { i32, <32 x i32> }%val
@@ -1055,25 +1225,28 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
   ; CHECK-LABEL: name: v3i32_struct_func_void_wasted_reg
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   [[IVEC:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[DEF1]], [[LOAD]](s32), [[C]](s32)
-  ; CHECK-NEXT:   [[IVEC1:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[LOAD1]](s32), [[C1]](s32)
-  ; CHECK-NEXT:   [[IVEC2:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], [[LOAD2]](s32), [[C2]](s32)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[IVEC2]](<3 x s32>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[LOAD3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF
+  ; CHECK:   [[DEF1:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; CHECK:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; CHECK:   [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `i32 addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `i32 addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `i32 addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `i32 addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   [[IVEC:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[DEF1]], [[LOAD]](s32), [[C]](s32)
+  ; CHECK:   [[IVEC1:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[LOAD1]](s32), [[C1]](s32)
+  ; CHECK:   [[IVEC2:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], [[LOAD2]](s32), [[C2]](s32)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[IVEC2]](<3 x s32>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[LOAD3]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %load0 = load volatile i32, i32 addrspace(3)* undef
   %load1 = load volatile i32, i32 addrspace(3)* undef
   %load2 = load volatile i32, i32 addrspace(3)* undef
@@ -1090,26 +1263,29 @@ define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
 define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
   ; CHECK-LABEL: name: v3f32_struct_func_void_wasted_reg
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY [[DEF]](p3)
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `float addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `float addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `float addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (volatile load (s32) from `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   [[IVEC:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[DEF1]], [[LOAD]](s32), [[C]](s32)
-  ; CHECK-NEXT:   [[IVEC1:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[LOAD1]](s32), [[C1]](s32)
-  ; CHECK-NEXT:   [[IVEC2:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], [[LOAD2]](s32), [[C2]](s32)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[IVEC2]](<3 x s32>)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[LOAD3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(p3) = COPY [[DEF]](p3)
+  ; CHECK:   [[DEF1:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; CHECK:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; CHECK:   [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `float addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `float addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p3) :: (volatile load (s32) from `float addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p3) :: (volatile load (s32) from `i32 addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   [[IVEC:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[DEF1]], [[LOAD]](s32), [[C]](s32)
+  ; CHECK:   [[IVEC1:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[LOAD1]](s32), [[C1]](s32)
+  ; CHECK:   [[IVEC2:%[0-9]+]]:_(<3 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], [[LOAD2]](s32), [[C2]](s32)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[IVEC2]](<3 x s32>)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[LOAD3]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %load0 = load volatile float, float addrspace(3)* undef
   %load1 = load volatile float, float addrspace(3)* undef
   %load2 = load volatile float, float addrspace(3)* undef
@@ -1126,21 +1302,22 @@ define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
 define void @void_func_sret_max_known_zero_bits(i8 addrspace(5)* sret(i8) %arg0) #0 {
   ; CHECK-LABEL: name: void_func_sret_max_known_zero_bits
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
-  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 18
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5)
-  ; CHECK-NEXT:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C]](s32)
-  ; CHECK-NEXT:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C1]](s32)
-  ; CHECK-NEXT:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C2]](s32)
-  ; CHECK-NEXT:   G_STORE [[LSHR]](s32), [[DEF]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   G_STORE [[LSHR1]](s32), [[DEF]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   G_STORE [[LSHR2]](s32), [[DEF]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
+  ; CHECK:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 18
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF
+  ; CHECK:   [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5)
+  ; CHECK:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C]](s32)
+  ; CHECK:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C1]](s32)
+  ; CHECK:   [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C2]](s32)
+  ; CHECK:   G_STORE [[LSHR]](s32), [[DEF]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   G_STORE [[LSHR1]](s32), [[DEF]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   G_STORE [[LSHR2]](s32), [[DEF]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]]
   %arg0.int = ptrtoint i8 addrspace(5)* %arg0 to i32
 
   %lshr0 = lshr i32 %arg0.int, 16
@@ -1156,43 +1333,46 @@ define void @void_func_sret_max_known_zero_bits(i8 addrspace(5)* sret(i8) %arg0)
 define i1022 @i1022_func_void() #0 {
   ; CHECK-LABEL: name: i1022_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1022) = G_LOAD [[DEF]](p1) :: (load (s1022) from `i1022 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s1024) = G_ANYEXT [[LOAD]](s1022)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s1024)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](s32)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](s32)
-  ; CHECK-NEXT:   $vgpr8 = COPY [[UV8]](s32)
-  ; CHECK-NEXT:   $vgpr9 = COPY [[UV9]](s32)
-  ; CHECK-NEXT:   $vgpr10 = COPY [[UV10]](s32)
-  ; CHECK-NEXT:   $vgpr11 = COPY [[UV11]](s32)
-  ; CHECK-NEXT:   $vgpr12 = COPY [[UV12]](s32)
-  ; CHECK-NEXT:   $vgpr13 = COPY [[UV13]](s32)
-  ; CHECK-NEXT:   $vgpr14 = COPY [[UV14]](s32)
-  ; CHECK-NEXT:   $vgpr15 = COPY [[UV15]](s32)
-  ; CHECK-NEXT:   $vgpr16 = COPY [[UV16]](s32)
-  ; CHECK-NEXT:   $vgpr17 = COPY [[UV17]](s32)
-  ; CHECK-NEXT:   $vgpr18 = COPY [[UV18]](s32)
-  ; CHECK-NEXT:   $vgpr19 = COPY [[UV19]](s32)
-  ; CHECK-NEXT:   $vgpr20 = COPY [[UV20]](s32)
-  ; CHECK-NEXT:   $vgpr21 = COPY [[UV21]](s32)
-  ; CHECK-NEXT:   $vgpr22 = COPY [[UV22]](s32)
-  ; CHECK-NEXT:   $vgpr23 = COPY [[UV23]](s32)
-  ; CHECK-NEXT:   $vgpr24 = COPY [[UV24]](s32)
-  ; CHECK-NEXT:   $vgpr25 = COPY [[UV25]](s32)
-  ; CHECK-NEXT:   $vgpr26 = COPY [[UV26]](s32)
-  ; CHECK-NEXT:   $vgpr27 = COPY [[UV27]](s32)
-  ; CHECK-NEXT:   $vgpr28 = COPY [[UV28]](s32)
-  ; CHECK-NEXT:   $vgpr29 = COPY [[UV29]](s32)
-  ; CHECK-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[UV31]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s1022) = G_LOAD [[DEF]](p1) :: (load (s1022) from `i1022 addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s1024) = G_ANYEXT [[LOAD]](s1022)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s1024)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
+  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
+  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
+  ; CHECK:   $vgpr10 = COPY [[UV10]](s32)
+  ; CHECK:   $vgpr11 = COPY [[UV11]](s32)
+  ; CHECK:   $vgpr12 = COPY [[UV12]](s32)
+  ; CHECK:   $vgpr13 = COPY [[UV13]](s32)
+  ; CHECK:   $vgpr14 = COPY [[UV14]](s32)
+  ; CHECK:   $vgpr15 = COPY [[UV15]](s32)
+  ; CHECK:   $vgpr16 = COPY [[UV16]](s32)
+  ; CHECK:   $vgpr17 = COPY [[UV17]](s32)
+  ; CHECK:   $vgpr18 = COPY [[UV18]](s32)
+  ; CHECK:   $vgpr19 = COPY [[UV19]](s32)
+  ; CHECK:   $vgpr20 = COPY [[UV20]](s32)
+  ; CHECK:   $vgpr21 = COPY [[UV21]](s32)
+  ; CHECK:   $vgpr22 = COPY [[UV22]](s32)
+  ; CHECK:   $vgpr23 = COPY [[UV23]](s32)
+  ; CHECK:   $vgpr24 = COPY [[UV24]](s32)
+  ; CHECK:   $vgpr25 = COPY [[UV25]](s32)
+  ; CHECK:   $vgpr26 = COPY [[UV26]](s32)
+  ; CHECK:   $vgpr27 = COPY [[UV27]](s32)
+  ; CHECK:   $vgpr28 = COPY [[UV28]](s32)
+  ; CHECK:   $vgpr29 = COPY [[UV29]](s32)
+  ; CHECK:   $vgpr30 = COPY [[UV30]](s32)
+  ; CHECK:   $vgpr31 = COPY [[UV31]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
   %val = load i1022, i1022 addrspace(1)* undef
   ret i1022 %val
 }
@@ -1200,43 +1380,46 @@ define i1022 @i1022_func_void() #0 {
 define signext i1022 @i1022_signext_func_void() #0 {
   ; CHECK-LABEL: name: i1022_signext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1022) = G_LOAD [[DEF]](p1) :: (load (s1022) from `i1022 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s1024) = G_SEXT [[LOAD]](s1022)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT]](s1024)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](s32)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](s32)
-  ; CHECK-NEXT:   $vgpr8 = COPY [[UV8]](s32)
-  ; CHECK-NEXT:   $vgpr9 = COPY [[UV9]](s32)
-  ; CHECK-NEXT:   $vgpr10 = COPY [[UV10]](s32)
-  ; CHECK-NEXT:   $vgpr11 = COPY [[UV11]](s32)
-  ; CHECK-NEXT:   $vgpr12 = COPY [[UV12]](s32)
-  ; CHECK-NEXT:   $vgpr13 = COPY [[UV13]](s32)
-  ; CHECK-NEXT:   $vgpr14 = COPY [[UV14]](s32)
-  ; CHECK-NEXT:   $vgpr15 = COPY [[UV15]](s32)
-  ; CHECK-NEXT:   $vgpr16 = COPY [[UV16]](s32)
-  ; CHECK-NEXT:   $vgpr17 = COPY [[UV17]](s32)
-  ; CHECK-NEXT:   $vgpr18 = COPY [[UV18]](s32)
-  ; CHECK-NEXT:   $vgpr19 = COPY [[UV19]](s32)
-  ; CHECK-NEXT:   $vgpr20 = COPY [[UV20]](s32)
-  ; CHECK-NEXT:   $vgpr21 = COPY [[UV21]](s32)
-  ; CHECK-NEXT:   $vgpr22 = COPY [[UV22]](s32)
-  ; CHECK-NEXT:   $vgpr23 = COPY [[UV23]](s32)
-  ; CHECK-NEXT:   $vgpr24 = COPY [[UV24]](s32)
-  ; CHECK-NEXT:   $vgpr25 = COPY [[UV25]](s32)
-  ; CHECK-NEXT:   $vgpr26 = COPY [[UV26]](s32)
-  ; CHECK-NEXT:   $vgpr27 = COPY [[UV27]](s32)
-  ; CHECK-NEXT:   $vgpr28 = COPY [[UV28]](s32)
-  ; CHECK-NEXT:   $vgpr29 = COPY [[UV29]](s32)
-  ; CHECK-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[UV31]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s1022) = G_LOAD [[DEF]](p1) :: (load (s1022) from `i1022 addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[SEXT:%[0-9]+]]:_(s1024) = G_SEXT [[LOAD]](s1022)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT]](s1024)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
+  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
+  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
+  ; CHECK:   $vgpr10 = COPY [[UV10]](s32)
+  ; CHECK:   $vgpr11 = COPY [[UV11]](s32)
+  ; CHECK:   $vgpr12 = COPY [[UV12]](s32)
+  ; CHECK:   $vgpr13 = COPY [[UV13]](s32)
+  ; CHECK:   $vgpr14 = COPY [[UV14]](s32)
+  ; CHECK:   $vgpr15 = COPY [[UV15]](s32)
+  ; CHECK:   $vgpr16 = COPY [[UV16]](s32)
+  ; CHECK:   $vgpr17 = COPY [[UV17]](s32)
+  ; CHECK:   $vgpr18 = COPY [[UV18]](s32)
+  ; CHECK:   $vgpr19 = COPY [[UV19]](s32)
+  ; CHECK:   $vgpr20 = COPY [[UV20]](s32)
+  ; CHECK:   $vgpr21 = COPY [[UV21]](s32)
+  ; CHECK:   $vgpr22 = COPY [[UV22]](s32)
+  ; CHECK:   $vgpr23 = COPY [[UV23]](s32)
+  ; CHECK:   $vgpr24 = COPY [[UV24]](s32)
+  ; CHECK:   $vgpr25 = COPY [[UV25]](s32)
+  ; CHECK:   $vgpr26 = COPY [[UV26]](s32)
+  ; CHECK:   $vgpr27 = COPY [[UV27]](s32)
+  ; CHECK:   $vgpr28 = COPY [[UV28]](s32)
+  ; CHECK:   $vgpr29 = COPY [[UV29]](s32)
+  ; CHECK:   $vgpr30 = COPY [[UV30]](s32)
+  ; CHECK:   $vgpr31 = COPY [[UV31]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
   %val = load i1022, i1022 addrspace(1)* undef
   ret i1022 %val
 }
@@ -1244,43 +1427,46 @@ define signext i1022 @i1022_signext_func_void() #0 {
 define zeroext i1022 @i1022_zeroext_func_void() #0 {
   ; CHECK-LABEL: name: i1022_zeroext_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s1022) = G_LOAD [[DEF]](p1) :: (load (s1022) from `i1022 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s1024) = G_ZEXT [[LOAD]](s1022)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s1024)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
-  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
-  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
-  ; CHECK-NEXT:   $vgpr5 = COPY [[UV5]](s32)
-  ; CHECK-NEXT:   $vgpr6 = COPY [[UV6]](s32)
-  ; CHECK-NEXT:   $vgpr7 = COPY [[UV7]](s32)
-  ; CHECK-NEXT:   $vgpr8 = COPY [[UV8]](s32)
-  ; CHECK-NEXT:   $vgpr9 = COPY [[UV9]](s32)
-  ; CHECK-NEXT:   $vgpr10 = COPY [[UV10]](s32)
-  ; CHECK-NEXT:   $vgpr11 = COPY [[UV11]](s32)
-  ; CHECK-NEXT:   $vgpr12 = COPY [[UV12]](s32)
-  ; CHECK-NEXT:   $vgpr13 = COPY [[UV13]](s32)
-  ; CHECK-NEXT:   $vgpr14 = COPY [[UV14]](s32)
-  ; CHECK-NEXT:   $vgpr15 = COPY [[UV15]](s32)
-  ; CHECK-NEXT:   $vgpr16 = COPY [[UV16]](s32)
-  ; CHECK-NEXT:   $vgpr17 = COPY [[UV17]](s32)
-  ; CHECK-NEXT:   $vgpr18 = COPY [[UV18]](s32)
-  ; CHECK-NEXT:   $vgpr19 = COPY [[UV19]](s32)
-  ; CHECK-NEXT:   $vgpr20 = COPY [[UV20]](s32)
-  ; CHECK-NEXT:   $vgpr21 = COPY [[UV21]](s32)
-  ; CHECK-NEXT:   $vgpr22 = COPY [[UV22]](s32)
-  ; CHECK-NEXT:   $vgpr23 = COPY [[UV23]](s32)
-  ; CHECK-NEXT:   $vgpr24 = COPY [[UV24]](s32)
-  ; CHECK-NEXT:   $vgpr25 = COPY [[UV25]](s32)
-  ; CHECK-NEXT:   $vgpr26 = COPY [[UV26]](s32)
-  ; CHECK-NEXT:   $vgpr27 = COPY [[UV27]](s32)
-  ; CHECK-NEXT:   $vgpr28 = COPY [[UV28]](s32)
-  ; CHECK-NEXT:   $vgpr29 = COPY [[UV29]](s32)
-  ; CHECK-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[UV31]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(s1022) = G_LOAD [[DEF]](p1) :: (load (s1022) from `i1022 addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK:   [[ZEXT:%[0-9]+]]:_(s1024) = G_ZEXT [[LOAD]](s1022)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s1024)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK:   $vgpr5 = COPY [[UV5]](s32)
+  ; CHECK:   $vgpr6 = COPY [[UV6]](s32)
+  ; CHECK:   $vgpr7 = COPY [[UV7]](s32)
+  ; CHECK:   $vgpr8 = COPY [[UV8]](s32)
+  ; CHECK:   $vgpr9 = COPY [[UV9]](s32)
+  ; CHECK:   $vgpr10 = COPY [[UV10]](s32)
+  ; CHECK:   $vgpr11 = COPY [[UV11]](s32)
+  ; CHECK:   $vgpr12 = COPY [[UV12]](s32)
+  ; CHECK:   $vgpr13 = COPY [[UV13]](s32)
+  ; CHECK:   $vgpr14 = COPY [[UV14]](s32)
+  ; CHECK:   $vgpr15 = COPY [[UV15]](s32)
+  ; CHECK:   $vgpr16 = COPY [[UV16]](s32)
+  ; CHECK:   $vgpr17 = COPY [[UV17]](s32)
+  ; CHECK:   $vgpr18 = COPY [[UV18]](s32)
+  ; CHECK:   $vgpr19 = COPY [[UV19]](s32)
+  ; CHECK:   $vgpr20 = COPY [[UV20]](s32)
+  ; CHECK:   $vgpr21 = COPY [[UV21]](s32)
+  ; CHECK:   $vgpr22 = COPY [[UV22]](s32)
+  ; CHECK:   $vgpr23 = COPY [[UV23]](s32)
+  ; CHECK:   $vgpr24 = COPY [[UV24]](s32)
+  ; CHECK:   $vgpr25 = COPY [[UV25]](s32)
+  ; CHECK:   $vgpr26 = COPY [[UV26]](s32)
+  ; CHECK:   $vgpr27 = COPY [[UV27]](s32)
+  ; CHECK:   $vgpr28 = COPY [[UV28]](s32)
+  ; CHECK:   $vgpr29 = COPY [[UV29]](s32)
+  ; CHECK:   $vgpr30 = COPY [[UV30]](s32)
+  ; CHECK:   $vgpr31 = COPY [[UV31]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
   %val = load i1022, i1022 addrspace(1)* undef
   ret i1022 %val
 }
@@ -1290,31 +1476,32 @@ define zeroext i1022 @i1022_zeroext_func_void() #0 {
 define %struct.with.ptrs @ptr_in_struct_func_void() #0 {
   ; CHECK-LABEL: name: ptr_in_struct_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[DEF]](p1) :: (volatile load (<32 x s32>) from `%struct.with.ptrs addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C]](s64)
-  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p1) :: (volatile load (p3) from `%struct.with.ptrs addrspace(1)* undef` + 128, align 128, addrspace 1)
-  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 136
-  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C1]](s64)
-  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD1]](p1) :: (volatile load (p1) from `%struct.with.ptrs addrspace(1)* undef` + 136, addrspace 1)
-  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 144
-  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
-  ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD2]](p1) :: (volatile load (<2 x p1>) from `%struct.with.ptrs addrspace(1)* undef` + 144, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD]](<32 x s32>), [[COPY]](p5) :: (store (<32 x s32>), addrspace 5)
-  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
-  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](p3), [[PTR_ADD3]](p5) :: (store (p3), align 128, addrspace 5)
-  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 136
-  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-  ; CHECK-NEXT:   G_STORE [[LOAD2]](p1), [[PTR_ADD4]](p5) :: (store (p1), addrspace 5)
-  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 144
-  ; CHECK-NEXT:   [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-  ; CHECK-NEXT:   G_STORE [[LOAD3]](<2 x p1>), [[PTR_ADD5]](p5) :: (store (<2 x p1>), addrspace 5)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[DEF]](p1) :: (volatile load (<32 x s32>) from `%struct.with.ptrs addrspace(1)* undef`, addrspace 1)
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C]](s64)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p1) :: (volatile load (p3) from `%struct.with.ptrs addrspace(1)* undef` + 128, align 128, addrspace 1)
+  ; CHECK:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 136
+  ; CHECK:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C1]](s64)
+  ; CHECK:   [[LOAD2:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD1]](p1) :: (volatile load (p1) from `%struct.with.ptrs addrspace(1)* undef` + 136, addrspace 1)
+  ; CHECK:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 144
+  ; CHECK:   [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
+  ; CHECK:   [[LOAD3:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD2]](p1) :: (volatile load (<2 x p1>) from `%struct.with.ptrs addrspace(1)* undef` + 144, addrspace 1)
+  ; CHECK:   G_STORE [[LOAD]](<32 x s32>), [[COPY]](p5) :: (store (<32 x s32>), addrspace 5)
+  ; CHECK:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
+  ; CHECK:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+  ; CHECK:   G_STORE [[LOAD1]](p3), [[PTR_ADD3]](p5) :: (store (p3), align 128, addrspace 5)
+  ; CHECK:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 136
+  ; CHECK:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+  ; CHECK:   G_STORE [[LOAD2]](p1), [[PTR_ADD4]](p5) :: (store (p1), addrspace 5)
+  ; CHECK:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 144
+  ; CHECK:   [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+  ; CHECK:   G_STORE [[LOAD3]](<2 x p1>), [[PTR_ADD5]](p5) :: (store (<2 x p1>), addrspace 5)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]]
   %val = load volatile %struct.with.ptrs, %struct.with.ptrs addrspace(1)* undef
   ret %struct.with.ptrs %val
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll
index 17a1a86cc649f..420b9ef437b19 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.ll
@@ -16,89 +16,113 @@
 define i32 addrspace(4)* @external_constant_got() {
   ; GCN-LABEL: name: external_constant_got
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_constant + 4, target-flags(amdgpu-gotprel32-hi) @external_constant + 12, implicit-def $scc
   ; GCN:   [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[SI_PC_ADD_REL_OFFSET]](p4) :: (dereferenceable invariant load (p4) from got, addrspace 4)
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p4)
   ; GCN:   $vgpr0 = COPY [[UV]](s32)
   ; GCN:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; GCN:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   ret i32 addrspace(4)* @external_constant
 }
 
 define i32 addrspace(1)* @external_global_got() {
   ; GCN-LABEL: name: external_global_got
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_global + 4, target-flags(amdgpu-gotprel32-hi) @external_global + 12, implicit-def $scc
   ; GCN:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[SI_PC_ADD_REL_OFFSET]](p4) :: (dereferenceable invariant load (p1) from got, addrspace 4)
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p1)
   ; GCN:   $vgpr0 = COPY [[UV]](s32)
   ; GCN:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; GCN:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   ret i32 addrspace(1)* @external_global
 }
 
 define i32 addrspace(999)* @external_other_got() {
   ; GCN-LABEL: name: external_other_got
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_other + 4, target-flags(amdgpu-gotprel32-hi) @external_other + 12, implicit-def $scc
   ; GCN:   [[LOAD:%[0-9]+]]:_(p999) = G_LOAD [[SI_PC_ADD_REL_OFFSET]](p4) :: (dereferenceable invariant load (p999) from got, addrspace 4)
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p999)
   ; GCN:   $vgpr0 = COPY [[UV]](s32)
   ; GCN:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; GCN:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   ret i32 addrspace(999)* @external_other
 }
 
 define i32 addrspace(4)* @internal_constant_pcrel() {
   ; GCN-LABEL: name: internal_constant_pcrel
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @internal_constant + 4, target-flags(amdgpu-rel32-hi) @internal_constant + 12, implicit-def $scc
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SI_PC_ADD_REL_OFFSET]](p4)
   ; GCN:   $vgpr0 = COPY [[UV]](s32)
   ; GCN:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; GCN:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   ret i32 addrspace(4)* @internal_constant
 }
 
 define i32 addrspace(1)* @internal_global_pcrel() {
   ; GCN-LABEL: name: internal_global_pcrel
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p1) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @internal_global + 4, target-flags(amdgpu-rel32-hi) @internal_global + 12, implicit-def $scc
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SI_PC_ADD_REL_OFFSET]](p1)
   ; GCN:   $vgpr0 = COPY [[UV]](s32)
   ; GCN:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; GCN:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   ret i32 addrspace(1)* @internal_global
 }
 
 define i32 addrspace(999)* @internal_other_pcrel() {
   ; GCN-LABEL: name: internal_other_pcrel
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p999) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @internal_other + 4, target-flags(amdgpu-rel32-hi) @internal_other + 12, implicit-def $scc
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SI_PC_ADD_REL_OFFSET]](p999)
   ; GCN:   $vgpr0 = COPY [[UV]](s32)
   ; GCN:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; GCN:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
   ret i32 addrspace(999)* @internal_other
 }
 
 define i32 addrspace(6)* @external_constant32_got() {
   ; GCN-LABEL: name: external_constant32_got
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_constant32 + 4, target-flags(amdgpu-gotprel32-hi) @external_constant32 + 12, implicit-def $scc
   ; GCN:   [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[SI_PC_ADD_REL_OFFSET]](p4) :: (dereferenceable invariant load (p4) from got, addrspace 4)
   ; GCN:   [[EXTRACT:%[0-9]+]]:_(p6) = G_EXTRACT [[LOAD]](p4), 0
   ; GCN:   $vgpr0 = COPY [[EXTRACT]](p6)
-  ; GCN:   SI_RETURN implicit $vgpr0
+  ; GCN:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   ret i32 addrspace(6)* @external_constant32
 }
 
 define i32 addrspace(6)* @internal_constant32_pcrel() {
   ; GCN-LABEL: name: internal_constant32_pcrel
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @internal_constant32 + 4, target-flags(amdgpu-rel32-hi) @internal_constant32 + 12, implicit-def $scc
   ; GCN:   [[EXTRACT:%[0-9]+]]:_(p6) = G_EXTRACT [[SI_PC_ADD_REL_OFFSET]](p4), 0
   ; GCN:   $vgpr0 = COPY [[EXTRACT]](p6)
-  ; GCN:   SI_RETURN implicit $vgpr0
+  ; GCN:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   ret i32 addrspace(6)* @internal_constant32
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll
index 5206284cf5723..9ed3ec23a181e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll
@@ -4,13 +4,15 @@
 define float @test_atomicrmw_fadd(float addrspace(3)* %addr) {
   ; CHECK-LABEL: name: test_atomicrmw_fadd
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; CHECK-NEXT:   [[ATOMICRMW_FADD:%[0-9]+]]:_(s32) = G_ATOMICRMW_FADD [[COPY]](p3), [[C]] :: (load store seq_cst (s32) on %ir.addr, addrspace 3)
   ; CHECK-NEXT:   $vgpr0 = COPY [[ATOMICRMW_FADD]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %oldval = atomicrmw fadd float addrspace(3)* %addr, float 1.0 seq_cst
   ret float %oldval
 }
@@ -19,9 +21,10 @@ define float @test_atomicrmw_fsub(float addrspace(3)* %addr) {
   ; CHECK-LABEL: name: test_atomicrmw_fsub
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s32) from %ir.addr, addrspace 3)
@@ -30,8 +33,8 @@ define float @test_atomicrmw_fsub(float addrspace(3)* %addr) {
   ; CHECK-NEXT: bb.2.atomicrmw.start:
   ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s64) = G_PHI %8(s64), %bb.2, [[C1]](s64), %bb.1
-  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %6(s32), %bb.2
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s64) = G_PHI %9(s64), %bb.2, [[C1]](s64), %bb.1
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %7(s32), %bb.2
   ; CHECK-NEXT:   [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[PHI1]], [[C]]
   ; CHECK-NEXT:   [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[PHI1]], [[FSUB]] :: (load store seq_cst seq_cst (s32) on %ir.2, addrspace 3)
   ; CHECK-NEXT:   [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64)
@@ -44,7 +47,8 @@ define float @test_atomicrmw_fsub(float addrspace(3)* %addr) {
   ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2
   ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[PHI2]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %oldval = atomicrmw fsub float addrspace(3)* %addr, float 1.0 seq_cst
   ret float %oldval
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll
index 7015bd8f9c965..2e647da9b5df3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll
@@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_call_no_other_sgprs() {
 define void @func_call_no_workitem_ids() {
   ; CHECK-LABEL: name: func_call_no_workitem_ids
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr14
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr13
@@ -134,27 +134,29 @@ define void @func_call_no_workitem_ids() {
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY7]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY8]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY10]](s64)
-  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY11]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY12]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY13]](s32)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY15]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY8]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY13]](s32)
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY7]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY16]]
   call void @extern() "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z"
   ret void
 }
@@ -162,30 +164,32 @@ define void @func_call_no_workitem_ids() {
 define void @func_call_no_workgroup_ids() {
   ; CHECK-LABEL: name: func_call_no_workgroup_ids
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(p4) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY10]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY6]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY7]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY8]](s64)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY9]](s32)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY11]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY6]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY7]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY8]](p4)
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY9]](s64)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY10]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY5]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY12]]
   call void @extern() "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z"
   ret void
 }
@@ -193,21 +197,23 @@ define void @func_call_no_workgroup_ids() {
 define void @func_call_no_other_sgprs() {
   ; CHECK-LABEL: name: func_call_no_other_sgprs
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr31, $sgpr8_sgpr9
+  ; CHECK-NEXT:   liveins: $vgpr31, $sgpr8_sgpr9, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY2]](p4)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(p4) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY5]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY3]](p4)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY4]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr8_sgpr9, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY6]]
   call void @extern() "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z"
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
index 851e893bead77..00f2e4b620c22 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
@@ -116,7 +116,7 @@ define amdgpu_kernel void @test_call_external_void_func_i32([17 x i8]) #0 {
 define void @test_func_call_external_void_func_i32() #0 {
   ; GFX900-LABEL: name: test_func_call_external_void_func_i32
   ; GFX900: bb.1 (%ir-block.0):
-  ; GFX900-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GFX900-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GFX900-NEXT: {{  $}}
   ; GFX900-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GFX900-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -126,34 +126,36 @@ define void @test_func_call_external_void_func_i32() #0 {
   ; GFX900-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
   ; GFX900-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; GFX900-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; GFX900-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX900-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 99
   ; GFX900-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GFX900-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32
-  ; GFX900-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GFX900-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GFX900-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GFX900-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GFX900-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GFX900-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GFX900-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GFX900-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GFX900-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GFX900-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GFX900-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GFX900-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GFX900-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GFX900-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GFX900-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GFX900-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GFX900-NEXT:   $vgpr0 = COPY [[C]](s32)
-  ; GFX900-NEXT:   [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX900-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>)
-  ; GFX900-NEXT:   $sgpr4_sgpr5 = COPY [[COPY8]](p4)
-  ; GFX900-NEXT:   $sgpr6_sgpr7 = COPY [[COPY9]](p4)
-  ; GFX900-NEXT:   $sgpr8_sgpr9 = COPY [[COPY10]](p4)
-  ; GFX900-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
-  ; GFX900-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
-  ; GFX900-NEXT:   $sgpr13 = COPY [[COPY13]](s32)
-  ; GFX900-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
-  ; GFX900-NEXT:   $vgpr31 = COPY [[COPY15]](s32)
+  ; GFX900-NEXT:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX900-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
+  ; GFX900-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; GFX900-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; GFX900-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; GFX900-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; GFX900-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
+  ; GFX900-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
+  ; GFX900-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
+  ; GFX900-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
   ; GFX900-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; GFX900-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; GFX900-NEXT:   SI_RETURN
+  ; GFX900-NEXT:   [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; GFX900-NEXT:   S_SETPC_B64_return [[COPY18]]
   ; GFX908-LABEL: name: test_func_call_external_void_func_i32
   ; GFX908: bb.1 (%ir-block.0):
-  ; GFX908-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GFX908-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -163,31 +165,33 @@ define void @test_func_call_external_void_func_i32() #0 {
   ; GFX908-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
   ; GFX908-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; GFX908-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; GFX908-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX908-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 99
   ; GFX908-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GFX908-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32
-  ; GFX908-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GFX908-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GFX908-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GFX908-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GFX908-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GFX908-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GFX908-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GFX908-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GFX908-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GFX908-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GFX908-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GFX908-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GFX908-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GFX908-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GFX908-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GFX908-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GFX908-NEXT:   $vgpr0 = COPY [[C]](s32)
-  ; GFX908-NEXT:   [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX908-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>)
-  ; GFX908-NEXT:   $sgpr4_sgpr5 = COPY [[COPY8]](p4)
-  ; GFX908-NEXT:   $sgpr6_sgpr7 = COPY [[COPY9]](p4)
-  ; GFX908-NEXT:   $sgpr8_sgpr9 = COPY [[COPY10]](p4)
-  ; GFX908-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
-  ; GFX908-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
-  ; GFX908-NEXT:   $sgpr13 = COPY [[COPY13]](s32)
-  ; GFX908-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
-  ; GFX908-NEXT:   $vgpr31 = COPY [[COPY15]](s32)
+  ; GFX908-NEXT:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX908-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
+  ; GFX908-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; GFX908-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; GFX908-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; GFX908-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; GFX908-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
+  ; GFX908-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
+  ; GFX908-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
+  ; GFX908-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
   ; GFX908-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; GFX908-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; GFX908-NEXT:   SI_RETURN
+  ; GFX908-NEXT:   [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; GFX908-NEXT:   S_SETPC_B64_return [[COPY18]]
   call void @external_void_func_i32(i32 99)
   ret void
 }
@@ -374,7 +378,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 {
 define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 {
   ; GFX900-LABEL: name: test_func_call_external_void_func_v32i32
   ; GFX900: bb.1 (%ir-block.1):
-  ; GFX900-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GFX900-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GFX900-NEXT: {{  $}}
   ; GFX900-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GFX900-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -435,22 +439,23 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 {
   ; GFX900-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr16
   ; GFX900-NEXT:   [[TRUNC32:%[0-9]+]]:_(s16) = G_TRUNC [[COPY24]](s32)
   ; GFX900-NEXT:   [[TRUNC33:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC32]](s16)
+  ; GFX900-NEXT:   [[COPY25:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX900-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX900-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
   ; GFX900-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GFX900-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32
-  ; GFX900-NEXT:   [[COPY25:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GFX900-NEXT:   [[COPY26:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GFX900-NEXT:   [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GFX900-NEXT:   [[COPY28:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GFX900-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GFX900-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GFX900-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GFX900-NEXT:   [[COPY32:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GFX900-NEXT:   [[COPY26:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GFX900-NEXT:   [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GFX900-NEXT:   [[COPY28:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GFX900-NEXT:   [[COPY29:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GFX900-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GFX900-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GFX900-NEXT:   [[COPY32:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GFX900-NEXT:   [[COPY33:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GFX900-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
-  ; GFX900-NEXT:   [[COPY33:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; GFX900-NEXT:   [[COPY34:%[0-9]+]]:_(p5) = COPY $sgpr32
   ; GFX900-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; GFX900-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY33]], [[C1]](s32)
+  ; GFX900-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY34]], [[C1]](s32)
   ; GFX900-NEXT:   G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
   ; GFX900-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX900-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -483,22 +488,23 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 {
   ; GFX900-NEXT:   $vgpr28 = COPY [[UV28]](s32)
   ; GFX900-NEXT:   $vgpr29 = COPY [[UV29]](s32)
   ; GFX900-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; GFX900-NEXT:   [[COPY34:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX900-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY34]](<4 x s32>)
-  ; GFX900-NEXT:   $sgpr4_sgpr5 = COPY [[COPY25]](p4)
-  ; GFX900-NEXT:   $sgpr6_sgpr7 = COPY [[COPY26]](p4)
-  ; GFX900-NEXT:   $sgpr8_sgpr9 = COPY [[COPY27]](p4)
-  ; GFX900-NEXT:   $sgpr10_sgpr11 = COPY [[COPY28]](s64)
-  ; GFX900-NEXT:   $sgpr12 = COPY [[COPY29]](s32)
-  ; GFX900-NEXT:   $sgpr13 = COPY [[COPY30]](s32)
-  ; GFX900-NEXT:   $sgpr14 = COPY [[COPY31]](s32)
-  ; GFX900-NEXT:   $vgpr31 = COPY [[COPY32]](s32)
+  ; GFX900-NEXT:   [[COPY35:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX900-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY35]](<4 x s32>)
+  ; GFX900-NEXT:   $sgpr4_sgpr5 = COPY [[COPY26]](p4)
+  ; GFX900-NEXT:   $sgpr6_sgpr7 = COPY [[COPY27]](p4)
+  ; GFX900-NEXT:   $sgpr8_sgpr9 = COPY [[COPY28]](p4)
+  ; GFX900-NEXT:   $sgpr10_sgpr11 = COPY [[COPY29]](s64)
+  ; GFX900-NEXT:   $sgpr12 = COPY [[COPY30]](s32)
+  ; GFX900-NEXT:   $sgpr13 = COPY [[COPY31]](s32)
+  ; GFX900-NEXT:   $sgpr14 = COPY [[COPY32]](s32)
+  ; GFX900-NEXT:   $vgpr31 = COPY [[COPY33]](s32)
   ; GFX900-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_v32i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; GFX900-NEXT:   ADJCALLSTACKDOWN 0, 4, implicit-def $scc
-  ; GFX900-NEXT:   SI_RETURN
+  ; GFX900-NEXT:   [[COPY36:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY25]]
+  ; GFX900-NEXT:   S_SETPC_B64_return [[COPY36]]
   ; GFX908-LABEL: name: test_func_call_external_void_func_v32i32
   ; GFX908: bb.1 (%ir-block.1):
-  ; GFX908-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GFX908-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -559,22 +565,23 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 {
   ; GFX908-NEXT:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr16
   ; GFX908-NEXT:   [[TRUNC32:%[0-9]+]]:_(s16) = G_TRUNC [[COPY24]](s32)
   ; GFX908-NEXT:   [[TRUNC33:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC32]](s16)
+  ; GFX908-NEXT:   [[COPY25:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX908-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX908-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
   ; GFX908-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GFX908-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32
-  ; GFX908-NEXT:   [[COPY25:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GFX908-NEXT:   [[COPY26:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GFX908-NEXT:   [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GFX908-NEXT:   [[COPY28:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GFX908-NEXT:   [[COPY29:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GFX908-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GFX908-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GFX908-NEXT:   [[COPY32:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GFX908-NEXT:   [[COPY26:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GFX908-NEXT:   [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GFX908-NEXT:   [[COPY28:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GFX908-NEXT:   [[COPY29:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GFX908-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GFX908-NEXT:   [[COPY31:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GFX908-NEXT:   [[COPY32:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GFX908-NEXT:   [[COPY33:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GFX908-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>)
-  ; GFX908-NEXT:   [[COPY33:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; GFX908-NEXT:   [[COPY34:%[0-9]+]]:_(p5) = COPY $sgpr32
   ; GFX908-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; GFX908-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY33]], [[C1]](s32)
+  ; GFX908-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY34]], [[C1]](s32)
   ; GFX908-NEXT:   G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
   ; GFX908-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX908-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -607,19 +614,20 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 {
   ; GFX908-NEXT:   $vgpr28 = COPY [[UV28]](s32)
   ; GFX908-NEXT:   $vgpr29 = COPY [[UV29]](s32)
   ; GFX908-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; GFX908-NEXT:   [[COPY34:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX908-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY34]](<4 x s32>)
-  ; GFX908-NEXT:   $sgpr4_sgpr5 = COPY [[COPY25]](p4)
-  ; GFX908-NEXT:   $sgpr6_sgpr7 = COPY [[COPY26]](p4)
-  ; GFX908-NEXT:   $sgpr8_sgpr9 = COPY [[COPY27]](p4)
-  ; GFX908-NEXT:   $sgpr10_sgpr11 = COPY [[COPY28]](s64)
-  ; GFX908-NEXT:   $sgpr12 = COPY [[COPY29]](s32)
-  ; GFX908-NEXT:   $sgpr13 = COPY [[COPY30]](s32)
-  ; GFX908-NEXT:   $sgpr14 = COPY [[COPY31]](s32)
-  ; GFX908-NEXT:   $vgpr31 = COPY [[COPY32]](s32)
+  ; GFX908-NEXT:   [[COPY35:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX908-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY35]](<4 x s32>)
+  ; GFX908-NEXT:   $sgpr4_sgpr5 = COPY [[COPY26]](p4)
+  ; GFX908-NEXT:   $sgpr6_sgpr7 = COPY [[COPY27]](p4)
+  ; GFX908-NEXT:   $sgpr8_sgpr9 = COPY [[COPY28]](p4)
+  ; GFX908-NEXT:   $sgpr10_sgpr11 = COPY [[COPY29]](s64)
+  ; GFX908-NEXT:   $sgpr12 = COPY [[COPY30]](s32)
+  ; GFX908-NEXT:   $sgpr13 = COPY [[COPY31]](s32)
+  ; GFX908-NEXT:   $sgpr14 = COPY [[COPY32]](s32)
+  ; GFX908-NEXT:   $vgpr31 = COPY [[COPY33]](s32)
   ; GFX908-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_v32i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; GFX908-NEXT:   ADJCALLSTACKDOWN 0, 4, implicit-def $scc
-  ; GFX908-NEXT:   SI_RETURN
+  ; GFX908-NEXT:   [[COPY36:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY25]]
+  ; GFX908-NEXT:   S_SETPC_B64_return [[COPY36]]
   call void @external_void_func_v32i32(<32 x i32> zeroinitializer)
   ret void
 }

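The irtranslator-call-non-fixed.ll changes below exercise the amdgpu_gfx calling convention, where the same pattern appears with the gfx-flavored pieces: gfx_ccr_sgpr_64 as the return-pointer register class, S_SETPC_B64_return_gfx as the return pseudo, and csr_amdgpu_si_gfx as the callee-saved mask on the call itself. A hedged sketch of the opcode split, assuming it is keyed on the calling convention (the opcode and convention names are real; the selection site is illustrative):

    // Gfx functions return through the gfx-specific pseudo; everything else
    // uses the plain one.
    const unsigned ReturnOpc = CallConv == CallingConv::AMDGPU_Gfx
                                   ? AMDGPU::S_SETPC_B64_return_gfx
                                   : AMDGPU::S_SETPC_B64_return;
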
diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
index cb06c47525207..67cb7683fbf5b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
@@ -11,13 +11,17 @@ declare hidden amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8,
 define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
   ; CHECK-LABEL: name: test_gfx_call_external_void_func_void
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   call amdgpu_gfx void @external_gfx_void_func_void()
   ret void
 }
@@ -25,18 +29,20 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
 define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
   ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm
   ; CHECK: bb.1 (%ir-block.1):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32
   ; CHECK-NEXT:   $vgpr0 = COPY [[C]](s32)
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   call amdgpu_gfx void @external_gfx_void_func_i32(i32 42)
   ret void
 }
@@ -44,18 +50,20 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
 define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg) #0 {
   ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm_inreg
   ; CHECK: bb.1 (%ir-block.1):
-  ; CHECK-NEXT:   liveins: $sgpr4
+  ; CHECK-NEXT:   liveins: $sgpr4, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
   ; CHECK-NEXT:   $sgpr4 = COPY [[C]](s32)
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
   ret void
 }
@@ -63,6 +71,9 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
 define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
   ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load (s8) from %ir.ptr0, align 4, addrspace 1)
@@ -75,11 +86,12 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
   ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
   ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT1]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[LOAD2]](s32)
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
   call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val)
@@ -89,6 +101,9 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
 define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #0 {
   ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32_inreg
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load (s8) from %ir.ptr0, align 4, addrspace 1)
@@ -101,11 +116,12 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
   ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
   ; CHECK-NEXT:   $sgpr4 = COPY [[ANYEXT1]](s32)
   ; CHECK-NEXT:   $sgpr5 = COPY [[LOAD2]](s32)
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
   call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 1ae8ee55c7876..17d8f497cbcdb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -129,22 +129,24 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)*
 define amdgpu_gfx void @test_gfx_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
   ; GCN-LABEL: name: test_gfx_call_external_i32_func_i32_imm
   ; GCN: bb.1 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
   ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i32_func_i32
   ; GCN-NEXT:   $vgpr0 = COPY [[C]](s32)
-  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
   ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; GCN-NEXT:   G_STORE [[COPY3]](s32), [[MV]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
-  ; GCN-NEXT:   SI_RETURN
+  ; GCN-NEXT:   G_STORE [[COPY4]](s32), [[MV]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY2]]
+  ; GCN-NEXT:   S_SETPC_B64_return_gfx [[COPY5]]
   %val = call amdgpu_gfx i32 @external_gfx_i32_func_i32(i32 42)
   store volatile i32 %val, i32 addrspace(1)* %out
   ret void
@@ -210,17 +212,21 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
 define amdgpu_gfx void @test_gfx_call_external_i1_func_void() #0 {
   ; GCN-LABEL: name: test_gfx_call_external_i1_func_void
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN-NEXT:   liveins: $sgpr30_sgpr31
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i1_func_void
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
   ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i1_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `i1 addrspace(1)* undef`, addrspace 1)
-  ; GCN-NEXT:   SI_RETURN
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   %val = call amdgpu_gfx i1 @external_gfx_i1_func_void()
   store volatile i1 %val, i1 addrspace(1)* undef
   ret void
@@ -405,18 +411,22 @@ define amdgpu_kernel void @test_call_external_i8_func_void() #0 {
 define amdgpu_gfx void @test_gfx_call_external_i8_func_void() #0 {
   ; GCN-LABEL: name: test_gfx_call_external_i8_func_void
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN-NEXT:   liveins: $sgpr30_sgpr31
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i8_func_void
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
   ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i8_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
   ; GCN-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `i8 addrspace(1)* undef`, addrspace 1)
-  ; GCN-NEXT:   SI_RETURN
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   %val = call amdgpu_gfx i8 @external_gfx_i8_func_void()
   store volatile i8 %val, i8 addrspace(1)* undef
   ret void
@@ -776,16 +786,20 @@ define amdgpu_kernel void @test_call_external_i32_func_void() #0 {
 define amdgpu_gfx void @test_gfx_call_external_i32_func_void() #0 {
   ; GCN-LABEL: name: test_gfx_call_external_i32_func_void
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN-NEXT:   liveins: $sgpr30_sgpr31
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i32_func_void
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
   ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; GCN-NEXT:   G_STORE [[COPY1]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; GCN-NEXT:   SI_RETURN
+  ; GCN-NEXT:   G_STORE [[COPY2]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   %val = call amdgpu_gfx i32 @external_gfx_i32_func_void()
   store volatile i32 %val, i32 addrspace(1)* undef
   ret void
@@ -2495,21 +2509,25 @@ define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 {
 define amdgpu_gfx void @test_gfx_call_external_i32_i64_func_void() #0 {
   ; GCN-LABEL: name: test_gfx_call_external_i32_i64_func_void
   ; GCN: bb.1 (%ir-block.0):
+  ; GCN-NEXT:   liveins: $sgpr30_sgpr31
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i32_i64_func_void
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
   ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_i64_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2
-  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GCN-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GCN-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; GCN-NEXT:   G_STORE [[COPY2]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; GCN-NEXT:   G_STORE [[MV]](s64), [[COPY]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
-  ; GCN-NEXT:   SI_RETURN
+  ; GCN-NEXT:   G_STORE [[COPY3]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN-NEXT:   G_STORE [[MV]](s64), [[COPY1]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   S_SETPC_B64_return_gfx [[COPY6]]
   %val = call amdgpu_gfx { i32, i64 } @external_gfx_i32_i64_func_void()
   %val.0 = extractvalue { i32, i64 } %val, 0
   %val.1 = extractvalue { i32, i64 } %val, 1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 9a7552591c5ab..d81a40bfe6d04 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -149,13 +149,17 @@ define amdgpu_kernel void @test_call_external_void_func_void() #0 {
 define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
   ; CHECK-LABEL: name: test_gfx_call_external_void_func_void
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   call amdgpu_gfx void @external_gfx_void_func_void()
   ret void
 }
@@ -163,7 +167,7 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
 define void @test_func_call_external_void_func_void() #0 {
   ; CHECK-LABEL: name: test_func_call_external_void_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -173,29 +177,31 @@ define void @test_func_call_external_void_func_void() #0 {
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_void
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY8]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
-  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY13]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY18]]
   call void @external_void_func_void()
   ret void
 }
@@ -883,18 +889,20 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
 define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
   ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm
   ; CHECK: bb.1 (%ir-block.1):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32
   ; CHECK-NEXT:   $vgpr0 = COPY [[C]](s32)
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   call amdgpu_gfx void @external_gfx_void_func_i32(i32 42)
   ret void
 }
@@ -902,18 +910,20 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
 define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg) #0 {
   ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm_inreg
   ; CHECK: bb.1 (%ir-block.1):
-  ; CHECK-NEXT:   liveins: $sgpr4
+  ; CHECK-NEXT:   liveins: $sgpr4, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
   ; CHECK-NEXT:   $sgpr4 = COPY [[C]](s32)
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
   ret void
 }
@@ -3866,6 +3876,9 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
   ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load (s8) from %ir.ptr0, align 4, addrspace 1)
@@ -3878,11 +3891,12 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
   ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
   ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT1]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[LOAD2]](s32)
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
   call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val)
@@ -3892,6 +3906,9 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
 define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #0 {
   ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32_inreg
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load (s8) from %ir.ptr0, align 4, addrspace 1)
@@ -3904,11 +3921,12 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
   ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
   ; CHECK-NEXT:   $sgpr4 = COPY [[ANYEXT1]](s32)
   ; CHECK-NEXT:   $sgpr5 = COPY [[LOAD2]](s32)
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
   call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val)
@@ -3989,7 +4007,7 @@ declare void @void_func_byval_a3i32_byval_i8_align32([3 x i32] addrspace(5)* byv
 define void @call_byval_3ai32_byval_i8_align32([3 x i32] addrspace(5)* %incoming0, i8 addrspace(5)* align 32 %incoming1) #0 {
   ; CHECK-LABEL: name: call_byval_3ai32_byval_i8_align32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -4001,40 +4019,42 @@ define void @call_byval_3ai32_byval_i8_align32([3 x i32] addrspace(5)* %incoming
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p5) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p5) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 999
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_byval_a3i32_byval_i8_align32
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(p5) = COPY $sgpr32
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY18]], [[C1]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C1]](s32)
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C2]](s32), 0 :: (dereferenceable store (s96) into stack, align 4, addrspace 5), (dereferenceable load (s96) from %ir.incoming0, align 4, addrspace 5)
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY18]], [[C3]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C3]](s32)
   ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; CHECK-NEXT:   G_MEMCPY [[PTR_ADD1]](p5), [[COPY9]](p5), [[C4]](s32), 0 :: (dereferenceable store (s8) into stack + 32, align 32, addrspace 5), (dereferenceable load (s8) from %ir.incoming1, align 32, addrspace 5)
   ; CHECK-NEXT:   $vgpr0 = COPY [[C]](s32)
-  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
-  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY14]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY15]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY17]](s32)
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY12]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY13]](p4)
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY14]](s64)
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY16]](s32)
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @void_func_byval_a3i32_byval_i8_align32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 36, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY21]]
   call void @void_func_byval_a3i32_byval_i8_align32([3 x i32] addrspace(5)* byval([3 x i32]) %incoming0, i8 addrspace(5)* align 32 %incoming1, i32 999)
   ret void
 }
@@ -4046,7 +4066,7 @@ declare void @void_func_byval_a4i64_align4([4 x i64] addrspace(5)* byval([4 x i6
 define void @call_byval_a4i64_align4_higher_source_align([4 x i64] addrspace(5)* align 256 %incoming_high_align) #0 {
   ; CHECK-LABEL: name: call_byval_a4i64_align4_higher_source_align
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -4057,34 +4077,36 @@ define void @call_byval_a4i64_align4_higher_source_align([4 x i64] addrspace(5)*
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p5) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_byval_a4i64_align4
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(p5) = COPY $sgpr32
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY18]], [[C]](s32)
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
   ; CHECK-NEXT:   G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C1]](s32), 0 :: (dereferenceable store (s256) into stack, align 4, addrspace 5), (dereferenceable load (s256) from %ir.incoming_high_align, align 256, addrspace 5)
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
-  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY14]](s32)
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @void_func_byval_a4i64_align4, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 32, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY20]]
   call void @void_func_byval_a4i64_align4([4 x i64] addrspace(5)* byval([4 x i64]) align 4 %incoming_high_align)
   ret void
 }
@@ -4582,7 +4604,7 @@ entry:
 define void @stack_12xv3i32() #0 {
   ; CHECK-LABEL: name: stack_12xv3i32
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -4592,6 +4614,7 @@ define void @stack_12xv3i32() #0 {
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32)
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
@@ -4622,14 +4645,14 @@ define void @stack_12xv3i32() #0 {
   ; CHECK-NEXT:   [[BUILD_VECTOR11:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C13]](s32), [[C14]](s32), [[C15]](s32)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_12xv3i32
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
   ; CHECK-NEXT:   [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<3 x s32>)
   ; CHECK-NEXT:   [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<3 x s32>)
@@ -4641,22 +4664,22 @@ define void @stack_12xv3i32() #0 {
   ; CHECK-NEXT:   [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR8]](<3 x s32>)
   ; CHECK-NEXT:   [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR9]](<3 x s32>)
   ; CHECK-NEXT:   [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR10]](<3 x s32>)
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(p5) = COPY $sgpr32
   ; CHECK-NEXT:   [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C16]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C16]](s32)
   ; CHECK-NEXT:   G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
   ; CHECK-NEXT:   [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C17]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C17]](s32)
   ; CHECK-NEXT:   G_STORE [[UV32]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack + 4, addrspace 5)
   ; CHECK-NEXT:   [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR11]](<3 x s32>)
   ; CHECK-NEXT:   [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C18]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C18]](s32)
   ; CHECK-NEXT:   G_STORE [[UV33]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
   ; CHECK-NEXT:   [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C19]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C19]](s32)
   ; CHECK-NEXT:   G_STORE [[UV34]](s32), [[PTR_ADD3]](p5) :: (store (s32) into stack + 12, addrspace 5)
   ; CHECK-NEXT:   [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C20]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C20]](s32)
   ; CHECK-NEXT:   G_STORE [[UV35]](s32), [[PTR_ADD4]](p5) :: (store (s32) into stack + 16, align 16, addrspace 5)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -4689,19 +4712,20 @@ define void @stack_12xv3i32() #0 {
   ; CHECK-NEXT:   $vgpr28 = COPY [[UV28]](s32)
   ; CHECK-NEXT:   $vgpr29 = COPY [[UV29]](s32)
   ; CHECK-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY8]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
-  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY13]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_12xv3i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 20, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY19]]
 entry:
   call void @external_void_func_12xv3i32(
       <3 x i32> <i32 0, i32 0, i32 0>,
@@ -4722,7 +4746,7 @@ entry:
 define void @stack_12xv3f32() #0 {
   ; CHECK-LABEL: name: stack_12xv3f32
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -4732,6 +4756,7 @@ define void @stack_12xv3f32() #0 {
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32)
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
@@ -4762,14 +4787,14 @@ define void @stack_12xv3f32() #0 {
   ; CHECK-NEXT:   [[BUILD_VECTOR11:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C13]](s32), [[C14]](s32), [[C15]](s32)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_12xv3f32
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
   ; CHECK-NEXT:   [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<3 x s32>)
   ; CHECK-NEXT:   [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<3 x s32>)
@@ -4781,22 +4806,22 @@ define void @stack_12xv3f32() #0 {
   ; CHECK-NEXT:   [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR8]](<3 x s32>)
   ; CHECK-NEXT:   [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR9]](<3 x s32>)
   ; CHECK-NEXT:   [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR10]](<3 x s32>)
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(p5) = COPY $sgpr32
   ; CHECK-NEXT:   [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C16]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C16]](s32)
   ; CHECK-NEXT:   G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
   ; CHECK-NEXT:   [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C17]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C17]](s32)
   ; CHECK-NEXT:   G_STORE [[UV32]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack + 4, addrspace 5)
   ; CHECK-NEXT:   [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR11]](<3 x s32>)
   ; CHECK-NEXT:   [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C18]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C18]](s32)
   ; CHECK-NEXT:   G_STORE [[UV33]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
   ; CHECK-NEXT:   [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C19]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C19]](s32)
   ; CHECK-NEXT:   G_STORE [[UV34]](s32), [[PTR_ADD3]](p5) :: (store (s32) into stack + 12, addrspace 5)
   ; CHECK-NEXT:   [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C20]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C20]](s32)
   ; CHECK-NEXT:   G_STORE [[UV35]](s32), [[PTR_ADD4]](p5) :: (store (s32) into stack + 16, align 16, addrspace 5)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -4829,19 +4854,20 @@ define void @stack_12xv3f32() #0 {
   ; CHECK-NEXT:   $vgpr28 = COPY [[UV28]](s32)
   ; CHECK-NEXT:   $vgpr29 = COPY [[UV29]](s32)
   ; CHECK-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY8]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
-  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY13]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_12xv3f32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 20, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY19]]
 entry:
   call void @external_void_func_12xv3f32(
       <3 x float> <float 0.0, float 0.0, float 0.0>,
@@ -4862,7 +4888,7 @@ entry:
 define void @stack_8xv5i32() #0 {
   ; CHECK-LABEL: name: stack_8xv5i32
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -4872,6 +4898,7 @@ define void @stack_8xv5i32() #0 {
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
@@ -4898,14 +4925,14 @@ define void @stack_8xv5i32() #0 {
   ; CHECK-NEXT:   [[BUILD_VECTOR7:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C11]](s32), [[C12]](s32), [[C13]](s32), [[C14]](s32), [[C15]](s32)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_8xv5i32
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<5 x s32>)
   ; CHECK-NEXT:   [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<5 x s32>)
   ; CHECK-NEXT:   [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<5 x s32>)
@@ -4913,34 +4940,34 @@ define void @stack_8xv5i32() #0 {
   ; CHECK-NEXT:   [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR4]](<5 x s32>)
   ; CHECK-NEXT:   [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR5]](<5 x s32>)
   ; CHECK-NEXT:   [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR6]](<5 x s32>)
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(p5) = COPY $sgpr32
   ; CHECK-NEXT:   [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C16]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C16]](s32)
   ; CHECK-NEXT:   G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
   ; CHECK-NEXT:   [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C17]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C17]](s32)
   ; CHECK-NEXT:   G_STORE [[UV32]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack + 4, addrspace 5)
   ; CHECK-NEXT:   [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C18]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C18]](s32)
   ; CHECK-NEXT:   G_STORE [[UV33]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
   ; CHECK-NEXT:   [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C19]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C19]](s32)
   ; CHECK-NEXT:   G_STORE [[UV34]](s32), [[PTR_ADD3]](p5) :: (store (s32) into stack + 12, addrspace 5)
   ; CHECK-NEXT:   [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR7]](<5 x s32>)
   ; CHECK-NEXT:   [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C20]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C20]](s32)
   ; CHECK-NEXT:   G_STORE [[UV35]](s32), [[PTR_ADD4]](p5) :: (store (s32) into stack + 16, align 16, addrspace 5)
   ; CHECK-NEXT:   [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
-  ; CHECK-NEXT:   [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C21]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C21]](s32)
   ; CHECK-NEXT:   G_STORE [[UV36]](s32), [[PTR_ADD5]](p5) :: (store (s32) into stack + 20, addrspace 5)
   ; CHECK-NEXT:   [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
-  ; CHECK-NEXT:   [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C22]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C22]](s32)
   ; CHECK-NEXT:   G_STORE [[UV37]](s32), [[PTR_ADD6]](p5) :: (store (s32) into stack + 24, align 8, addrspace 5)
   ; CHECK-NEXT:   [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
-  ; CHECK-NEXT:   [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C23]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C23]](s32)
   ; CHECK-NEXT:   G_STORE [[UV38]](s32), [[PTR_ADD7]](p5) :: (store (s32) into stack + 28, addrspace 5)
   ; CHECK-NEXT:   [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-  ; CHECK-NEXT:   [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C24]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C24]](s32)
   ; CHECK-NEXT:   G_STORE [[UV39]](s32), [[PTR_ADD8]](p5) :: (store (s32) into stack + 32, align 16, addrspace 5)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -4973,19 +5000,20 @@ define void @stack_8xv5i32() #0 {
   ; CHECK-NEXT:   $vgpr28 = COPY [[UV28]](s32)
   ; CHECK-NEXT:   $vgpr29 = COPY [[UV29]](s32)
   ; CHECK-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY8]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
-  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY13]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_8xv5i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 36, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY19]]
 entry:
   call void @external_void_func_8xv5i32(
       <5 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0>,
@@ -5002,7 +5030,7 @@ entry:
 define void @stack_8xv5f32() #0 {
   ; CHECK-LABEL: name: stack_8xv5f32
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -5012,6 +5040,7 @@ define void @stack_8xv5f32() #0 {
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
@@ -5038,14 +5067,14 @@ define void @stack_8xv5f32() #0 {
   ; CHECK-NEXT:   [[BUILD_VECTOR7:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C11]](s32), [[C12]](s32), [[C13]](s32), [[C14]](s32), [[C15]](s32)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_8xv5f32
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<5 x s32>)
   ; CHECK-NEXT:   [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<5 x s32>)
   ; CHECK-NEXT:   [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<5 x s32>)
@@ -5053,34 +5082,34 @@ define void @stack_8xv5f32() #0 {
   ; CHECK-NEXT:   [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR4]](<5 x s32>)
   ; CHECK-NEXT:   [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR5]](<5 x s32>)
   ; CHECK-NEXT:   [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR6]](<5 x s32>)
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(p5) = COPY $sgpr32
   ; CHECK-NEXT:   [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C16]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C16]](s32)
   ; CHECK-NEXT:   G_STORE [[UV31]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
   ; CHECK-NEXT:   [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C17]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C17]](s32)
   ; CHECK-NEXT:   G_STORE [[UV32]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack + 4, addrspace 5)
   ; CHECK-NEXT:   [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C18]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C18]](s32)
   ; CHECK-NEXT:   G_STORE [[UV33]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
   ; CHECK-NEXT:   [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C19]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C19]](s32)
   ; CHECK-NEXT:   G_STORE [[UV34]](s32), [[PTR_ADD3]](p5) :: (store (s32) into stack + 12, addrspace 5)
   ; CHECK-NEXT:   [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR7]](<5 x s32>)
   ; CHECK-NEXT:   [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C20]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C20]](s32)
   ; CHECK-NEXT:   G_STORE [[UV35]](s32), [[PTR_ADD4]](p5) :: (store (s32) into stack + 16, align 16, addrspace 5)
   ; CHECK-NEXT:   [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
-  ; CHECK-NEXT:   [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C21]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C21]](s32)
   ; CHECK-NEXT:   G_STORE [[UV36]](s32), [[PTR_ADD5]](p5) :: (store (s32) into stack + 20, addrspace 5)
   ; CHECK-NEXT:   [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
-  ; CHECK-NEXT:   [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C22]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C22]](s32)
   ; CHECK-NEXT:   G_STORE [[UV37]](s32), [[PTR_ADD6]](p5) :: (store (s32) into stack + 24, align 8, addrspace 5)
   ; CHECK-NEXT:   [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
-  ; CHECK-NEXT:   [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C23]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C23]](s32)
   ; CHECK-NEXT:   G_STORE [[UV38]](s32), [[PTR_ADD7]](p5) :: (store (s32) into stack + 28, addrspace 5)
   ; CHECK-NEXT:   [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-  ; CHECK-NEXT:   [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY16]], [[C24]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY17]], [[C24]](s32)
   ; CHECK-NEXT:   G_STORE [[UV39]](s32), [[PTR_ADD8]](p5) :: (store (s32) into stack + 32, align 16, addrspace 5)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -5113,19 +5142,20 @@ define void @stack_8xv5f32() #0 {
   ; CHECK-NEXT:   $vgpr28 = COPY [[UV28]](s32)
   ; CHECK-NEXT:   $vgpr29 = COPY [[UV29]](s32)
   ; CHECK-NEXT:   $vgpr30 = COPY [[UV30]](s32)
-  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY8]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
-  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY13]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_void_func_8xv5f32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 36, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY19]]
 entry:
   call void @external_void_func_8xv5f32(
       <5 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>,

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll
index 138e66ffd6c7a..78ecae87d06f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll
@@ -6,17 +6,21 @@
 define i32 @test() {
   ; CHECK-LABEL: name: test
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
   ; CHECK-NEXT:   [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[C]](s32)
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var
   ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[INTTOPTR]](p0), [[GV]]
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY [[ZEXT]](s32)
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY [[ZEXT]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
-  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY4]](s32)
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0
   ret i32 bitcast (<1 x i32> <i32 extractelement (<1 x i32> bitcast (i32 zext (i1 icmp eq (i32* @var, i32* inttoptr (i32 -1 to i32*)) to i32) to <1 x i32>), i64 0)> to i32)
 }
 
@@ -62,6 +66,9 @@ define amdgpu_kernel void @constantexpr_select_1() {
 define i32 @test_fcmp_constexpr() {
   ; CHECK-LABEL: name: test_fcmp_constexpr
   ; CHECK: bb.1.entry:
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[GV]], [[C]](s64)
@@ -72,7 +79,8 @@ define i32 @test_fcmp_constexpr() {
   ; CHECK-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oeq), [[UITOFP]](s32), [[C1]]
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FCMP]](s1)
   ; CHECK-NEXT:   $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
 entry:
   ret i32 zext (i1 fcmp oeq (float uitofp (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @a, i64 0, i64 1), i32* @var) to float), float 0.000000e+00) to i32)
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
index e01d9c399e937..10218b9a88d07 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
@@ -4,13 +4,15 @@
 define float @v_constained_fadd_f32_fpexcept_strict(float %x, float %y) #0 {
   ; CHECK-LABEL: name: v_constained_fadd_f32_fpexcept_strict
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[STRICT_FADD:%[0-9]+]]:_(s32) = G_STRICT_FADD [[COPY]], [[COPY1]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[STRICT_FADD]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %val
 }
@@ -18,13 +20,15 @@ define float @v_constained_fadd_f32_fpexcept_strict(float %x, float %y) #0 {
 define float @v_constained_fadd_f32_fpexcept_strict_flags(float %x, float %y) #0 {
   ; CHECK-LABEL: name: v_constained_fadd_f32_fpexcept_strict_flags
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[STRICT_FADD:%[0-9]+]]:_(s32) = nsz G_STRICT_FADD [[COPY]], [[COPY1]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[STRICT_FADD]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %val = call nsz float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %val
 }
@@ -32,13 +36,15 @@ define float @v_constained_fadd_f32_fpexcept_strict_flags(float %x, float %y) #0
 define float @v_constained_fadd_f32_fpexcept_ignore(float %x, float %y) #0 {
   ; CHECK-LABEL: name: v_constained_fadd_f32_fpexcept_ignore
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK-NEXT:   %2:_(s32) = nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
-  ; CHECK-NEXT:   $vgpr0 = COPY %2(s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   %3:_(s32) = nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
+  ; CHECK-NEXT:   $vgpr0 = COPY %3(s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret float %val
 }
@@ -46,13 +52,15 @@ define float @v_constained_fadd_f32_fpexcept_ignore(float %x, float %y) #0 {
 define float @v_constained_fadd_f32_fpexcept_ignore_flags(float %x, float %y) #0 {
   ; CHECK-LABEL: name: v_constained_fadd_f32_fpexcept_ignore_flags
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK-NEXT:   %2:_(s32) = nsz nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
-  ; CHECK-NEXT:   $vgpr0 = COPY %2(s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   %3:_(s32) = nsz nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
+  ; CHECK-NEXT:   $vgpr0 = COPY %3(s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %val = call nsz float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret float %val
 }
@@ -60,13 +68,15 @@ define float @v_constained_fadd_f32_fpexcept_ignore_flags(float %x, float %y) #0
 define float @v_constained_fadd_f32_fpexcept_maytrap(float %x, float %y) #0 {
   ; CHECK-LABEL: name: v_constained_fadd_f32_fpexcept_maytrap
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[STRICT_FADD:%[0-9]+]]:_(s32) = G_STRICT_FADD [[COPY]], [[COPY1]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[STRICT_FADD]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret float %val
 }
@@ -74,7 +84,7 @@ define float @v_constained_fadd_f32_fpexcept_maytrap(float %x, float %y) #0 {
 define <2 x float> @v_constained_fadd_v2f32_fpexcept_strict(<2 x float> %x, <2 x float> %y) #0 {
   ; CHECK-LABEL: name: v_constained_fadd_v2f32_fpexcept_strict
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -82,11 +92,13 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_strict(<2 x float> %x, <2 x
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[STRICT_FADD:%[0-9]+]]:_(<2 x s32>) = G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[STRICT_FADD]](<2 x s32>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x float> %val
 }
@@ -94,7 +106,7 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_strict(<2 x float> %x, <2 x
 define <2 x float> @v_constained_fadd_v2f32_fpexcept_ignore(<2 x float> %x, <2 x float> %y) #0 {
   ; CHECK-LABEL: name: v_constained_fadd_v2f32_fpexcept_ignore
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -102,11 +114,13 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_ignore(<2 x float> %x, <2 x
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
-  ; CHECK-NEXT:   %6:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   %7:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x float> %val
 }
@@ -114,7 +128,7 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_ignore(<2 x float> %x, <2 x
 define <2 x float> @v_constained_fadd_v2f32_fpexcept_maytrap(<2 x float> %x, <2 x float> %y) #0 {
   ; CHECK-LABEL: name: v_constained_fadd_v2f32_fpexcept_maytrap
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -122,11 +136,13 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_maytrap(<2 x float> %x, <2
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[STRICT_FADD:%[0-9]+]]:_(<2 x s32>) = G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[STRICT_FADD]](<2 x s32>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x float> %val
 }
@@ -134,13 +150,15 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_maytrap(<2 x float> %x, <2
 define float @v_constained_fsub_f32_fpexcept_ignore_flags(float %x, float %y) #0 {
   ; CHECK-LABEL: name: v_constained_fsub_f32_fpexcept_ignore_flags
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK-NEXT:   %2:_(s32) = nsz nofpexcept G_STRICT_FSUB [[COPY]], [[COPY1]]
-  ; CHECK-NEXT:   $vgpr0 = COPY %2(s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   %3:_(s32) = nsz nofpexcept G_STRICT_FSUB [[COPY]], [[COPY1]]
+  ; CHECK-NEXT:   $vgpr0 = COPY %3(s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %val = call nsz float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret float %val
 }
@@ -148,13 +166,15 @@ define float @v_constained_fsub_f32_fpexcept_ignore_flags(float %x, float %y) #0
 define float @v_constained_fmul_f32_fpexcept_ignore_flags(float %x, float %y) #0 {
   ; CHECK-LABEL: name: v_constained_fmul_f32_fpexcept_ignore_flags
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK-NEXT:   %2:_(s32) = nsz nofpexcept G_STRICT_FMUL [[COPY]], [[COPY1]]
-  ; CHECK-NEXT:   $vgpr0 = COPY %2(s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   %3:_(s32) = nsz nofpexcept G_STRICT_FMUL [[COPY]], [[COPY1]]
+  ; CHECK-NEXT:   $vgpr0 = COPY %3(s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %val = call nsz float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret float %val
 }
@@ -162,13 +182,15 @@ define float @v_constained_fmul_f32_fpexcept_ignore_flags(float %x, float %y) #0
 define float @v_constained_fdiv_f32_fpexcept_ignore_flags(float %x, float %y) #0 {
   ; CHECK-LABEL: name: v_constained_fdiv_f32_fpexcept_ignore_flags
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK-NEXT:   %2:_(s32) = nsz nofpexcept G_STRICT_FDIV [[COPY]], [[COPY1]]
-  ; CHECK-NEXT:   $vgpr0 = COPY %2(s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   %3:_(s32) = nsz nofpexcept G_STRICT_FDIV [[COPY]], [[COPY1]]
+  ; CHECK-NEXT:   $vgpr0 = COPY %3(s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %val = call nsz float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret float %val
 }
@@ -176,13 +198,15 @@ define float @v_constained_fdiv_f32_fpexcept_ignore_flags(float %x, float %y) #0
 define float @v_constained_frem_f32_fpexcept_ignore_flags(float %x, float %y) #0 {
   ; CHECK-LABEL: name: v_constained_frem_f32_fpexcept_ignore_flags
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK-NEXT:   %2:_(s32) = nsz nofpexcept G_STRICT_FREM [[COPY]], [[COPY1]]
-  ; CHECK-NEXT:   $vgpr0 = COPY %2(s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   %3:_(s32) = nsz nofpexcept G_STRICT_FREM [[COPY]], [[COPY1]]
+  ; CHECK-NEXT:   $vgpr0 = COPY %3(s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %val = call nsz float @llvm.experimental.constrained.frem.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret float %val
 }
@@ -190,14 +214,16 @@ define float @v_constained_frem_f32_fpexcept_ignore_flags(float %x, float %y) #0
 define float @v_constained_fma_f32_fpexcept_ignore_flags(float %x, float %y, float %z) #0 {
   ; CHECK-LABEL: name: v_constained_fma_f32_fpexcept_ignore_flags
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK-NEXT:   %3:_(s32) = nsz nofpexcept G_STRICT_FMA [[COPY]], [[COPY1]], [[COPY2]]
-  ; CHECK-NEXT:   $vgpr0 = COPY %3(s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   %4:_(s32) = nsz nofpexcept G_STRICT_FMA [[COPY]], [[COPY1]], [[COPY2]]
+  ; CHECK-NEXT:   $vgpr0 = COPY %4(s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0
   %val = call nsz float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret float %val
 }
@@ -205,12 +231,14 @@ define float @v_constained_fma_f32_fpexcept_ignore_flags(float %x, float %y, flo
 define float @v_constained_sqrt_f32_fpexcept_strict(float %x) #0 {
   ; CHECK-LABEL: name: v_constained_sqrt_f32_fpexcept_strict
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[STRICT_FSQRT:%[0-9]+]]:_(s32) = G_STRICT_FSQRT [[COPY]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[STRICT_FSQRT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %val = call float @llvm.experimental.constrained.sqrt.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %val
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fixed-function-abi-vgpr-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fixed-function-abi-vgpr-args.ll
index af0dd868c9092..ceba50dc1b7b4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fixed-function-abi-vgpr-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fixed-function-abi-vgpr-args.ll
@@ -7,7 +7,7 @@
 define void @void_a31i32_i32([31 x i32] %arg0, i32 %arg1) {
   ; FIXED-LABEL: name: void_a31i32_i32
   ; FIXED: bb.1 (%ir-block.0):
-  ; FIXED-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; FIXED-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; FIXED-NEXT: {{  $}}
   ; FIXED-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; FIXED-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -42,9 +42,11 @@ define void @void_a31i32_i32([31 x i32] %arg0, i32 %arg1) {
   ; FIXED-NEXT:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; FIXED-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; FIXED-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5)
+  ; FIXED-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; FIXED-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; FIXED-NEXT:   G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; FIXED-NEXT:   SI_RETURN
+  ; FIXED-NEXT:   [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; FIXED-NEXT:   S_SETPC_B64_return [[COPY32]]
   store i32 %arg1, i32 addrspace(1)* undef
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index fa5daf763c3e9..70a41f9dafaaa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -8,12 +8,14 @@
 define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
   ; CHECK-LABEL: name: void_func_empty_arg
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i32 %arg1, i32 addrspace(1)* undef
   ret void
 }
@@ -21,12 +23,14 @@ define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
 define void @void_func_empty_array([0 x i8] %arg0, i32 %arg1) #0 {
   ; CHECK-LABEL: name: void_func_empty_array
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i32 %arg1, i32 addrspace(1)* undef
   ret void
 }
@@ -34,13 +38,15 @@ define void @void_func_empty_array([0 x i8] %arg0, i32 %arg1) #0 {
 define void @void_func_i1(i1 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (store (s1) into `i1 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i1 %arg0, i1 addrspace(1)* undef
   ret void
 }
@@ -48,17 +54,19 @@ define void @void_func_i1(i1 %arg0) #0 {
 define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 1
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   %ext = zext i1 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, i32 addrspace(1)* undef
@@ -68,17 +76,19 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 define void @void_func_i1_signext(i1 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i1_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 1
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   %ext = sext i1 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, i32 addrspace(1)* undef
@@ -89,10 +99,11 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-LABEL: name: i1_arg_i1_use
   ; CHECK: bb.1.bb:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
@@ -109,7 +120,8 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.bb2:
   ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s64)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
 bb:
   br i1 %arg, label %bb2, label %bb1
 
@@ -124,14 +136,16 @@ bb2:
 define void @void_func_i8(i8 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (store (s8) into `i8 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i8 %arg0, i8 addrspace(1)* undef
   ret void
 }
@@ -139,17 +153,19 @@ define void @void_func_i8(i8 %arg0) #0 {
 define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i8_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 8
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[ASSERT_ZEXT]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   %ext = zext i8 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, i32 addrspace(1)* undef
@@ -159,17 +175,19 @@ define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
 define void @void_func_i8_signext(i8 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i8_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 8
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[ASSERT_SEXT]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s8)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   %ext = sext i8 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, i32 addrspace(1)* undef
@@ -179,13 +197,15 @@ define void @void_func_i8_signext(i8 signext %arg0) #0 {
 define void @void_func_i16(i16 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (store (s16) into `i16 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i16 %arg0, i16 addrspace(1)* undef
   ret void
 }
@@ -193,17 +213,19 @@ define void @void_func_i16(i16 %arg0) #0 {
 define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i16_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 16
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_ZEXT]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s16)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ZEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   %ext = zext i16 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, i32 addrspace(1)* undef
@@ -213,17 +235,19 @@ define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
 define void @void_func_i16_signext(i16 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i16_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 16
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_SEXT]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   %ext = sext i16 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, i32 addrspace(1)* undef
@@ -233,13 +257,15 @@ define void @void_func_i16_signext(i16 signext %arg0) #0 {
 define void @void_func_i24(i24 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i24
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s24), [[DEF]](p1) :: (store (s24) into `i24 addrspace(1)* undef`, align 4, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i24 %arg0, i24 addrspace(1)* undef
   ret void
 }
@@ -247,14 +273,16 @@ define void @void_func_i24(i24 %arg0) #0 {
 define void @void_func_i24_zeroext(i24 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i24_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 24
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[ASSERT_ZEXT]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s24), [[DEF]](p1) :: (store (s24) into `i24 addrspace(1)* undef`, align 4, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i24 %arg0, i24 addrspace(1)* undef
   ret void
 }
@@ -262,14 +290,16 @@ define void @void_func_i24_zeroext(i24 zeroext %arg0) #0 {
 define void @void_func_i24_signext(i24 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i24_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 24
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[ASSERT_SEXT]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s24), [[DEF]](p1) :: (store (s24) into `i24 addrspace(1)* undef`, align 4, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i24 %arg0, i24 addrspace(1)* undef
   ret void
 }
@@ -277,12 +307,14 @@ define void @void_func_i24_signext(i24 signext %arg0) #0 {
 define void @void_func_i32(i32 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i32 %arg0, i32 addrspace(1)* undef
   ret void
 }
@@ -291,12 +323,14 @@ define void @void_func_i32(i32 %arg0) #0 {
 define void @void_func_i32_signext(i32 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i32_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i32 %arg0, i32 addrspace(1)* undef
   ret void
 }
@@ -305,12 +339,14 @@ define void @void_func_i32_signext(i32 signext %arg0) #0 {
 define void @void_func_i32_zeroext(i32 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i32_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i32 %arg0, i32 addrspace(1)* undef
   ret void
 }
@@ -318,12 +354,14 @@ define void @void_func_i32_zeroext(i32 zeroext %arg0) #0 {
 define void @void_func_p3i8(i8 addrspace(3)* %arg0) #0 {
   ; CHECK-LABEL: name: void_func_p3i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](p3), [[DEF]](p1) :: (store (p3) into `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store i8 addrspace(3)* %arg0, i8 addrspace(3)* addrspace(1)* undef
   ret void
 }
@@ -331,15 +369,17 @@ define void @void_func_p3i8(i8 addrspace(3)* %arg0) #0 {
 define void @void_func_i48(i48 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i48
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s48), [[DEF]](p1) :: (store (s48) into `i48 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store i48 %arg0, i48 addrspace(1)* undef
   ret void
 }
@@ -347,18 +387,20 @@ define void @void_func_i48(i48 %arg0) #0 {
 define void @void_func_i48_zeroext(i48 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i48_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s48)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ZEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s64), [[DEF]](p1) :: (store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   %ext = zext i48 %arg0 to i64
   %add = add i64 %ext, 12
   store i64 %add, i64 addrspace(1)* undef
@@ -368,18 +410,20 @@ define void @void_func_i48_zeroext(i48 zeroext %arg0) #0 {
 define void @void_func_i48_signext(i48 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i48_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s48)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s64) = G_ADD [[SEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s64), [[DEF]](p1) :: (store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   %ext = sext i48 %arg0 to i64
   %add = add i64 %ext, 12
   store i64 %add, i64 addrspace(1)* undef
@@ -389,14 +433,16 @@ define void @void_func_i48_signext(i48 signext %arg0) #0 {
 define void @void_func_i64(i64 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[MV]](s64), [[DEF]](p1) :: (store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store i64 %arg0, i64 addrspace(1)* undef
   ret void
 }
@@ -404,16 +450,18 @@ define void @void_func_i64(i64 %arg0) #0 {
 define void @void_func_i95(i95 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i95
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s95) = G_TRUNC [[MV]](s96)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s95), [[DEF]](p1) :: (store (s95) into `i95 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   store i95 %arg0, i95 addrspace(1)* undef
   ret void
 }
@@ -421,19 +469,21 @@ define void @void_func_i95(i95 %arg0) #0 {
 define void @void_func_i95_zeroext(i95 zeroext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i95_zeroext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s95) = G_TRUNC [[MV]](s96)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s96) = G_CONSTANT i96 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s96) = G_ZEXT [[TRUNC]](s95)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s96) = G_ADD [[ZEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s96), [[DEF]](p1) :: (store (s96) into `i96 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   %ext = zext i95 %arg0 to i96
   %add = add i96 %ext, 12
   store i96 %add, i96 addrspace(1)* undef
@@ -443,19 +493,21 @@ define void @void_func_i95_zeroext(i95 zeroext %arg0) #0 {
 define void @void_func_i95_signext(i95 signext %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i95_signext
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s95) = G_TRUNC [[MV]](s96)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s96) = G_CONSTANT i96 12
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s96) = G_SEXT [[TRUNC]](s95)
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s96) = G_ADD [[SEXT]], [[C]]
   ; CHECK-NEXT:   G_STORE [[ADD]](s96), [[DEF]](p1) :: (store (s96) into `i96 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   %ext = sext i95 %arg0 to i96
   %add = add i96 %ext, 12
   store i96 %add, i96 addrspace(1)* undef
@@ -465,15 +517,17 @@ define void @void_func_i95_signext(i95 signext %arg0) #0 {
 define void @void_func_i96(i96 %arg0) #0 {
   ; CHECK-LABEL: name: void_func_i96
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[MV]](s96), [[DEF]](p1) :: (store (s96) into `i96 addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   store i96 %arg0, i96 addrspace(1)* undef
   ret void
 }
@@ -481,14 +535,16 @@ define void @void_func_i96(i96 %arg0) #0 {
 define void @void_func_p0i8(i8* %arg0) #0 {
   ; CHECK-LABEL: name: void_func_p0i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[MV]](p0), [[DEF]](p1) :: (store (p0) into `i8* addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store i8* %arg0, i8* addrspace(1)* undef
   ret void
 }
@@ -496,14 +552,16 @@ define void @void_func_p0i8(i8* %arg0) #0 {
 define void @void_func_p1i8(i8 addrspace(1)* %arg0) #0 {
   ; CHECK-LABEL: name: void_func_p1i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[MV]](p1), [[DEF]](p1) :: (store (p1) into `i8 addrspace(1)* addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store i8 addrspace(1)* %arg0, i8 addrspace(1)* addrspace(1)* undef
   ret void
 }
@@ -511,13 +569,15 @@ define void @void_func_p1i8(i8 addrspace(1)* %arg0) #0 {
 define void @void_func_f16(half %arg0) #0 {
   ; CHECK-LABEL: name: void_func_f16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (store (s16) into `half addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store half %arg0, half addrspace(1)* undef
   ret void
 }
@@ -525,12 +585,14 @@ define void @void_func_f16(half %arg0) #0 {
 define void @void_func_f32(float %arg0) #0 {
   ; CHECK-LABEL: name: void_func_f32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (store (s32) into `float addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store float %arg0, float addrspace(1)* undef
   ret void
 }
@@ -538,14 +600,16 @@ define void @void_func_f32(float %arg0) #0 {
 define void @void_func_f64(double %arg0) #0 {
   ; CHECK-LABEL: name: void_func_f64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[MV]](s64), [[DEF]](p1) :: (store (s64) into `double addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store double %arg0, double addrspace(1)* undef
   ret void
 }
@@ -553,14 +617,16 @@ define void @void_func_f64(double %arg0) #0 {
 define void @void_func_v2i32(<2 x i32> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x s32>), [[DEF]](p1) :: (store (<2 x s32>) into `<2 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store <2 x i32> %arg0, <2 x i32> addrspace(1)* undef
   ret void
 }
@@ -568,15 +634,17 @@ define void @void_func_v2i32(<2 x i32> %arg0) #0 {
 define void @void_func_v2i24(<2 x i24> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2i24
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(<2 x s24>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](<2 x s24>), [[DEF]](p1) :: (store (<2 x s24>) into `<2 x i24> addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store <2 x i24> %arg0, <2 x i24> addrspace(1)* undef
   ret void
 }
@@ -584,16 +652,18 @@ define void @void_func_v2i24(<2 x i24> %arg0) #0 {
 define void @void_func_v3i24(<3 x i24> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v3i24
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(<3 x s24>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC]](<3 x s24>), [[DEF]](p1) :: (store (<3 x s24>) into `<3 x i24> addrspace(1)* undef`, align 16, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   store <3 x i24> %arg0, <3 x i24> addrspace(1)* undef
   ret void
 }
@@ -601,7 +671,7 @@ define void @void_func_v3i24(<3 x i24> %arg0) #0 {
 define void @void_func_v2i8(<2 x i8> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -609,9 +679,11 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[BUILD_VECTOR]](<2 x s16>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC2]](<2 x s8>), [[DEF]](p1) :: (store (<2 x s8>) into `<2 x i8> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store <2 x i8> %arg0, <2 x i8> addrspace(1)* undef
   ret void
 }
@@ -619,7 +691,7 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
 define void @void_func_v3i8(<3 x i8> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v3i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -629,9 +701,11 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
   ; CHECK-NEXT:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16)
   ; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[BUILD_VECTOR]](<3 x s16>)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC3]](<3 x s8>), [[DEF]](p1) :: (store (<3 x s8>) into `<3 x i8> addrspace(1)* undef`, align 4, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   store <3 x i8> %arg0, <3 x i8> addrspace(1)* undef
   ret void
 }
@@ -639,7 +713,7 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
 define void @void_func_v4i8(<4 x i8> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v4i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -651,9 +725,11 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
   ; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; CHECK-NEXT:   [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[BUILD_VECTOR]](<4 x s16>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC4]](<4 x s8>), [[DEF]](p1) :: (store (<4 x s8>) into `<4 x i8> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]]
   store <4 x i8> %arg0, <4 x i8> addrspace(1)* undef
   ret void
 }
@@ -661,14 +737,16 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
 define void @void_func_v2p3i8(<2 x i8 addrspace(3)*> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2p3i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY]](p3), [[COPY1]](p3)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `<2 x i8 addrspace(3)*> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store <2 x i8 addrspace(3)*> %arg0, <2 x i8 addrspace(3)*> addrspace(1)* undef
   ret void
 }
@@ -676,15 +754,17 @@ define void @void_func_v2p3i8(<2 x i8 addrspace(3)*> %arg0) #0 {
 define void @void_func_v3i32(<3 x i32> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v3i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (store (<3 x s32>) into `<3 x i32> addrspace(1)* undef`, align 16, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef
   ret void
 }
@@ -692,16 +772,18 @@ define void @void_func_v3i32(<3 x i32> %arg0) #0 {
 define void @void_func_v4i32(<4 x i32> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v4i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]]
   store <4 x i32> %arg0, <4 x i32> addrspace(1)* undef
   ret void
 }
@@ -709,7 +791,7 @@ define void @void_func_v4i32(<4 x i32> %arg0) #0 {
 define void @void_func_v5i32(<5 x i32> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v5i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -717,9 +799,11 @@ define void @void_func_v5i32(<5 x i32> %arg0) #0 {
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32)
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<5 x s32>), [[DEF]](p1) :: (store (<5 x s32>) into `<5 x i32> addrspace(1)* undef`, align 32, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY5]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY6]]
   store <5 x i32> %arg0, <5 x i32> addrspace(1)* undef
   ret void
 }
@@ -727,7 +811,7 @@ define void @void_func_v5i32(<5 x i32> %arg0) #0 {
 define void @void_func_v8i32(<8 x i32> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v8i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -738,9 +822,11 @@ define void @void_func_v8i32(<8 x i32> %arg0) #0 {
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<8 x s32>), [[DEF]](p1) :: (store (<8 x s32>) into `<8 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY9]]
   store <8 x i32> %arg0, <8 x i32> addrspace(1)* undef
   ret void
 }
@@ -748,7 +834,7 @@ define void @void_func_v8i32(<8 x i32> %arg0) #0 {
 define void @void_func_v16i32(<16 x i32> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v16i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -767,9 +853,11 @@ define void @void_func_v16i32(<16 x i32> %arg0) #0 {
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<16 x s32>), [[DEF]](p1) :: (store (<16 x s32>) into `<16 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY16]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY17]]
   store <16 x i32> %arg0, <16 x i32> addrspace(1)* undef
   ret void
 }
@@ -777,7 +865,7 @@ define void @void_func_v16i32(<16 x i32> %arg0) #0 {
 define void @void_func_v32i32(<32 x i32> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v32i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -813,9 +901,11 @@ define void @void_func_v32i32(<32 x i32> %arg0) #0 {
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY32]]
   store <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   ret void
 }
@@ -824,7 +914,7 @@ define void @void_func_v32i32(<32 x i32> %arg0) #0 {
 define void @void_func_v33i32(<33 x i32> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v33i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -862,9 +952,11 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
   ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.0, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<33 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32), [[LOAD1]](s32)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<33 x s32>), [[DEF]](p1) :: (store (<33 x s32>) into `<33 x i32> addrspace(1)* undef`, align 256, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY32]]
   store <33 x i32> %arg0, <33 x i32> addrspace(1)* undef
   ret void
 }
@@ -872,7 +964,7 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
 define void @void_func_v2i64(<2 x i64> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -881,9 +973,11 @@ define void @void_func_v2i64(<2 x i64> %arg0) #0 {
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<2 x i64> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]]
   store <2 x i64> %arg0, <2 x i64> addrspace(1)* undef
   ret void
 }
@@ -891,7 +985,7 @@ define void @void_func_v2i64(<2 x i64> %arg0) #0 {
 define void @void_func_v2p0i8(<2 x i8*> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2p0i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -900,9 +994,11 @@ define void @void_func_v2p0i8(<2 x i8*> %arg0) #0 {
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[MV]](p0), [[MV1]](p0)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x p0>), [[DEF]](p1) :: (store (<2 x p0>) into `<2 x i8*> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]]
   store <2 x i8*> %arg0, <2 x i8*> addrspace(1)* undef
   ret void
 }
@@ -910,7 +1006,7 @@ define void @void_func_v2p0i8(<2 x i8*> %arg0) #0 {
 define void @void_func_v2p1i8(<2 x i8 addrspace(1)*> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2p1i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -919,9 +1015,11 @@ define void @void_func_v2p1i8(<2 x i8 addrspace(1)*> %arg0) #0 {
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `<2 x i8 addrspace(1)*> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]]
   store <2 x i8 addrspace(1)*> %arg0, <2 x i8 addrspace(1)*> addrspace(1)* undef
   ret void
 }
@@ -929,7 +1027,7 @@ define void @void_func_v2p1i8(<2 x i8 addrspace(1)*> %arg0) #0 {
 define void @void_func_v3i64(<3 x i64> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v3i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -941,9 +1039,11 @@ define void @void_func_v3i64(<3 x i64> %arg0) #0 {
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK-NEXT:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<3 x s64>), [[DEF]](p1) :: (store (<3 x s64>) into `<3 x i64> addrspace(1)* undef`, align 32, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY7]]
   store <3 x i64> %arg0, <3 x i64> addrspace(1)* undef
   ret void
 }
@@ -951,7 +1051,7 @@ define void @void_func_v3i64(<3 x i64> %arg0) #0 {
 define void @void_func_v4i64(<4 x i64> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v4i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -966,9 +1066,11 @@ define void @void_func_v4i64(<4 x i64> %arg0) #0 {
   ; CHECK-NEXT:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK-NEXT:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<4 x s64>), [[DEF]](p1) :: (store (<4 x s64>) into `<4 x i64> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY9]]
   store <4 x i64> %arg0, <4 x i64> addrspace(1)* undef
   ret void
 }
@@ -976,7 +1078,7 @@ define void @void_func_v4i64(<4 x i64> %arg0) #0 {
 define void @void_func_v5i64(<5 x i64> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v5i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -994,9 +1096,11 @@ define void @void_func_v5i64(<5 x i64> %arg0) #0 {
   ; CHECK-NEXT:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; CHECK-NEXT:   [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64)
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<5 x s64>), [[DEF]](p1) :: (store (<5 x s64>) into `<5 x i64> addrspace(1)* undef`, align 64, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY11]]
   store <5 x i64> %arg0, <5 x i64> addrspace(1)* undef
   ret void
 }
@@ -1004,7 +1108,7 @@ define void @void_func_v5i64(<5 x i64> %arg0) #0 {
 define void @void_func_v8i64(<8 x i64> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v8i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1031,9 +1135,11 @@ define void @void_func_v8i64(<8 x i64> %arg0) #0 {
   ; CHECK-NEXT:   [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
   ; CHECK-NEXT:   [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<8 x s64>), [[DEF]](p1) :: (store (<8 x s64>) into `<8 x i64> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY16]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY17]]
   store <8 x i64> %arg0, <8 x i64> addrspace(1)* undef
   ret void
 }
@@ -1041,7 +1147,7 @@ define void @void_func_v8i64(<8 x i64> %arg0) #0 {
 define void @void_func_v16i64(<16 x i64> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v16i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1093,9 +1199,11 @@ define void @void_func_v16i64(<16 x i64> %arg0) #0 {
   ; CHECK-NEXT:   [[MV14:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY28]](s32), [[COPY29]](s32)
   ; CHECK-NEXT:   [[MV15:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY30]](s32), [[LOAD]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64), [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64), [[MV12]](s64), [[MV13]](s64), [[MV14]](s64), [[MV15]](s64)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<16 x s64>), [[DEF]](p1) :: (store (<16 x s64>) into `<16 x i64> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY32]]
   store <16 x i64> %arg0, <16 x i64> addrspace(1)* undef
   ret void
 }
@@ -1103,12 +1211,14 @@ define void @void_func_v16i64(<16 x i64> %arg0) #0 {
 define void @void_func_v2i16(<2 x i16> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](<2 x s16>), [[DEF]](p1) :: (store (<2 x s16>) into `<2 x i16> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store <2 x i16> %arg0, <2 x i16> addrspace(1)* undef
   ret void
 }
@@ -1116,16 +1226,18 @@ define void @void_func_v2i16(<2 x i16> %arg0) #0 {
 define void @void_func_v3i16(<3 x i16> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v3i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[UV]](<3 x s16>), [[DEF1]](p1) :: (store (<3 x s16>) into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef
   ret void
 }
@@ -1133,14 +1245,16 @@ define void @void_func_v3i16(<3 x i16> %arg0) #0 {
 define void @void_func_v4i16(<4 x i16> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v4i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (store (<4 x s16>) into `<4 x i16> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store <4 x i16> %arg0, <4 x i16> addrspace(1)* undef
   ret void
 }
@@ -1148,7 +1262,7 @@ define void @void_func_v4i16(<4 x i16> %arg0) #0 {
 define void @void_func_v5i16(<5 x i16> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v5i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1156,9 +1270,11 @@ define void @void_func_v5i16(<5 x i16> %arg0) #0 {
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<5 x s16>), [[UV1:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<10 x s16>)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[UV]](<5 x s16>), [[DEF1]](p1) :: (store (<5 x s16>) into `<5 x i16> addrspace(1)* undef`, align 16, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef
   ret void
 }
@@ -1166,16 +1282,18 @@ define void @void_func_v5i16(<5 x i16> %arg0) #0 {
 define void @void_func_v8i16(<8 x i16> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v8i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
   ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[CONCAT_VECTORS]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `<8 x i16> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]]
   store <8 x i16> %arg0, <8 x i16> addrspace(1)* undef
   ret void
 }
@@ -1183,7 +1301,7 @@ define void @void_func_v8i16(<8 x i16> %arg0) #0 {
 define void @void_func_v16i16(<16 x i16> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v16i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1194,9 +1312,11 @@ define void @void_func_v16i16(<16 x i16> %arg0) #0 {
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr6
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr7
   ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[CONCAT_VECTORS]](<16 x s16>), [[DEF]](p1) :: (store (<16 x s16>) into `<16 x i16> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY9]]
   store <16 x i16> %arg0, <16 x i16> addrspace(1)* undef
   ret void
 }
@@ -1206,7 +1326,7 @@ define void @void_func_v16i16(<16 x i16> %arg0) #0 {
 define void @void_func_v65i16(<65 x i16> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v65i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1246,9 +1366,11 @@ define void @void_func_v65i16(<65 x i16> %arg0) #0 {
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<130 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>), [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[COPY12]](<2 x s16>), [[COPY13]](<2 x s16>), [[COPY14]](<2 x s16>), [[COPY15]](<2 x s16>), [[COPY16]](<2 x s16>), [[COPY17]](<2 x s16>), [[COPY18]](<2 x s16>), [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>), [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[COPY23]](<2 x s16>), [[COPY24]](<2 x s16>), [[COPY25]](<2 x s16>), [[COPY26]](<2 x s16>), [[COPY27]](<2 x s16>), [[COPY28]](<2 x s16>), [[COPY29]](<2 x s16>), [[COPY30]](<2 x s16>), [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<65 x s16>), [[UV1:%[0-9]+]]:_(<65 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<130 x s16>)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[UV]](<65 x s16>), [[DEF1]](p1) :: (store (<65 x s16>) into `<65 x i16> addrspace(1)* undef`, align 256, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY32]]
   store <65 x i16> %arg0, <65 x i16> addrspace(1)* undef
   ret void
 }
@@ -1256,14 +1378,16 @@ define void @void_func_v65i16(<65 x i16> %arg0) #0 {
 define void @void_func_v2f32(<2 x float> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2f32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x s32>), [[DEF]](p1) :: (store (<2 x s32>) into `<2 x float> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store <2 x float> %arg0, <2 x float> addrspace(1)* undef
   ret void
 }
@@ -1271,15 +1395,17 @@ define void @void_func_v2f32(<2 x float> %arg0) #0 {
 define void @void_func_v3f32(<3 x float> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v3f32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (store (<3 x s32>) into `<3 x float> addrspace(1)* undef`, align 16, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   store <3 x float> %arg0, <3 x float> addrspace(1)* undef
   ret void
 }
@@ -1287,16 +1413,18 @@ define void @void_func_v3f32(<3 x float> %arg0) #0 {
 define void @void_func_v4f32(<4 x float> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v4f32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]]
   store <4 x float> %arg0, <4 x float> addrspace(1)* undef
   ret void
 }
@@ -1304,7 +1432,7 @@ define void @void_func_v4f32(<4 x float> %arg0) #0 {
 define void @void_func_v8f32(<8 x float> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v8f32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1315,9 +1443,11 @@ define void @void_func_v8f32(<8 x float> %arg0) #0 {
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<8 x s32>), [[DEF]](p1) :: (store (<8 x s32>) into `<8 x float> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY9]]
   store <8 x float> %arg0, <8 x float> addrspace(1)* undef
   ret void
 }
@@ -1325,7 +1455,7 @@ define void @void_func_v8f32(<8 x float> %arg0) #0 {
 define void @void_func_v16f32(<16 x float> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v16f32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1344,9 +1474,11 @@ define void @void_func_v16f32(<16 x float> %arg0) #0 {
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<16 x s32>), [[DEF]](p1) :: (store (<16 x s32>) into `<16 x float> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY16]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY17]]
   store <16 x float> %arg0, <16 x float> addrspace(1)* undef
   ret void
 }
@@ -1354,7 +1486,7 @@ define void @void_func_v16f32(<16 x float> %arg0) #0 {
 define void @void_func_v2f64(<2 x double> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2f64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1363,9 +1495,11 @@ define void @void_func_v2f64(<2 x double> %arg0) #0 {
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<2 x double> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]]
   store <2 x double> %arg0, <2 x double> addrspace(1)* undef
   ret void
 }
@@ -1373,7 +1507,7 @@ define void @void_func_v2f64(<2 x double> %arg0) #0 {
 define void @void_func_v3f64(<3 x double> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v3f64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1385,9 +1519,11 @@ define void @void_func_v3f64(<3 x double> %arg0) #0 {
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK-NEXT:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<3 x s64>), [[DEF]](p1) :: (store (<3 x s64>) into `<3 x double> addrspace(1)* undef`, align 32, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY7]]
   store <3 x double> %arg0, <3 x double> addrspace(1)* undef
   ret void
 }
@@ -1395,7 +1531,7 @@ define void @void_func_v3f64(<3 x double> %arg0) #0 {
 define void @void_func_v4f64(<4 x double> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v4f64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1410,9 +1546,11 @@ define void @void_func_v4f64(<4 x double> %arg0) #0 {
   ; CHECK-NEXT:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK-NEXT:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<4 x s64>), [[DEF]](p1) :: (store (<4 x s64>) into `<4 x double> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY9]]
   store <4 x double> %arg0, <4 x double> addrspace(1)* undef
   ret void
 }
@@ -1420,7 +1558,7 @@ define void @void_func_v4f64(<4 x double> %arg0) #0 {
 define void @void_func_v8f64(<8 x double> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v8f64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1447,9 +1585,11 @@ define void @void_func_v8f64(<8 x double> %arg0) #0 {
   ; CHECK-NEXT:   [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
   ; CHECK-NEXT:   [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<8 x s64>), [[DEF]](p1) :: (store (<8 x s64>) into `<8 x double> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY16]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY17]]
   store <8 x double> %arg0, <8 x double> addrspace(1)* undef
   ret void
 }
@@ -1457,7 +1597,7 @@ define void @void_func_v8f64(<8 x double> %arg0) #0 {
 define void @void_func_v16f64(<16 x double> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v16f64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1509,9 +1649,11 @@ define void @void_func_v16f64(<16 x double> %arg0) #0 {
   ; CHECK-NEXT:   [[MV14:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY28]](s32), [[COPY29]](s32)
   ; CHECK-NEXT:   [[MV15:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY30]](s32), [[LOAD]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64), [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64), [[MV12]](s64), [[MV13]](s64), [[MV14]](s64), [[MV15]](s64)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<16 x s64>), [[DEF]](p1) :: (store (<16 x s64>) into `<16 x double> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY32]]
   store <16 x double> %arg0, <16 x double> addrspace(1)* undef
   ret void
 }
@@ -1519,12 +1661,14 @@ define void @void_func_v16f64(<16 x double> %arg0) #0 {
 define void @void_func_v2f16(<2 x half> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2f16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](<2 x s16>), [[DEF]](p1) :: (store (<2 x s16>) into `<2 x half> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store <2 x half> %arg0, <2 x half> addrspace(1)* undef
   ret void
 }
@@ -1532,16 +1676,18 @@ define void @void_func_v2f16(<2 x half> %arg0) #0 {
 define void @void_func_v3f16(<3 x half> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v3f16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[UV]](<3 x s16>), [[DEF1]](p1) :: (store (<3 x s16>) into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store <3 x half> %arg0, <3 x half> addrspace(1)* undef
   ret void
 }
@@ -1549,14 +1695,16 @@ define void @void_func_v3f16(<3 x half> %arg0) #0 {
 define void @void_func_v4f16(<4 x half> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v4f16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (store (<4 x s16>) into `<4 x half> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store <4 x half> %arg0, <4 x half> addrspace(1)* undef
   ret void
 }
@@ -1564,16 +1712,18 @@ define void @void_func_v4f16(<4 x half> %arg0) #0 {
 define void @void_func_v8f16(<8 x half> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v8f16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
   ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[CONCAT_VECTORS]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `<8 x half> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]]
   store <8 x half> %arg0, <8 x half> addrspace(1)* undef
   ret void
 }
@@ -1581,7 +1731,7 @@ define void @void_func_v8f16(<8 x half> %arg0) #0 {
 define void @void_func_v16f16(<16 x half> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v16f16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
@@ -1592,9 +1742,11 @@ define void @void_func_v16f16(<16 x half> %arg0) #0 {
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr6
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr7
   ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[CONCAT_VECTORS]](<16 x s16>), [[DEF]](p1) :: (store (<16 x s16>) into `<16 x half> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY9]]
   store <16 x half> %arg0, <16 x half> addrspace(1)* undef
   ret void
 }
@@ -1603,19 +1755,21 @@ define void @void_func_v16f16(<16 x half> %arg0) #0 {
 define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
   ; CHECK-LABEL: name: void_func_i32_i64_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[MV]](s64), [[COPY4]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[MV]](s64), [[COPY5]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[COPY3]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY6]]
   store volatile i32 %arg0, i32 addrspace(1)* undef
   store volatile i64 %arg1, i64 addrspace(1)* undef
   store volatile i32 %arg2, i32 addrspace(1)* undef
@@ -1625,12 +1779,14 @@ define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
 define void @void_func_struct_i32({ i32 } %arg0) #0 {
   ; CHECK-LABEL: name: void_func_struct_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (store (s32) into `{ i32 } addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   store { i32 } %arg0, { i32 } addrspace(1)* undef
   ret void
 }
@@ -1638,18 +1794,20 @@ define void @void_func_struct_i32({ i32 } %arg0) #0 {
 define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
   ; CHECK-LABEL: name: void_func_struct_i8_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (store (s8) into `{ i8, i32 } addrspace(1)* undef`, align 4, addrspace 1)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C]](s64)
   ; CHECK-NEXT:   G_STORE [[COPY1]](s32), [[PTR_ADD]](p1) :: (store (s32) into `{ i8, i32 } addrspace(1)* undef` + 4, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   store { i8, i32 } %arg0, { i8, i32 } addrspace(1)* undef
   ret void
 }
@@ -1657,8 +1815,11 @@ define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
 define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0) #0 {
   ; CHECK-LABEL: name: void_func_byval_struct_i8_i32
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX]](p5)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p5) :: (dereferenceable load (s8) from %ir.arg0, align 4, addrspace 5)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
@@ -1668,7 +1829,8 @@ define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8,
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C1]](s64)
   ; CHECK-NEXT:   G_STORE [[LOAD1]](s32), [[PTR_ADD1]](p1) :: (store (s32) into `{ i8, i32 } addrspace(1)* undef` + 4, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
   %arg0.load = load { i8, i32 }, { i8, i32 } addrspace(5)* %arg0
   store { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
   ret void
@@ -1677,13 +1839,14 @@ define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8,
 define void @void_func_byval_struct_i8_i32_x2({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0, { i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg1, i32 %arg2) #0 {
   ; CHECK-LABEL: name: void_func_byval_struct_i8_i32_x2
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX]](p5)
   ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX1]](p5)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p5) :: (volatile dereferenceable load (s8) from %ir.arg0, align 4, addrspace 5)
@@ -1701,7 +1864,8 @@ define void @void_func_byval_struct_i8_i32_x2({ i8, i32 } addrspace(5)* byval({
   ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C1]](s64)
   ; CHECK-NEXT:   G_STORE [[LOAD3]](s32), [[PTR_ADD3]](p1) :: (volatile store (s32) into `{ i8, i32 } addrspace(1)* undef` + 4, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[COPY2]](s32), [[DEF1]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   %arg0.load = load volatile { i8, i32 }, { i8, i32 } addrspace(5)* %arg0
   %arg1.load = load volatile { i8, i32 }, { i8, i32 } addrspace(5)* %arg1
   store volatile { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
@@ -1713,17 +1877,21 @@ define void @void_func_byval_struct_i8_i32_x2({ i8, i32 } addrspace(5)* byval({
 define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval(i32) %arg0, i64 addrspace(5)* byval(i64) %arg1) #0 {
   ; CHECK-LABEL: name: void_func_byval_i32_byval_i64
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX]](p5)
   ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX1]](p5)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (dereferenceable load (s32) from %ir.arg0, addrspace 5)
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[COPY1]](p5) :: (dereferenceable load (s64) from %ir.arg1, addrspace 5)
   ; CHECK-NEXT:   G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](s64), [[COPY2]](p1) :: (store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[LOAD1]](s64), [[COPY3]](p1) :: (store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   %arg0.load = load i32, i32 addrspace(5)* %arg0
   %arg1.load = load i64, i64 addrspace(5)* %arg1
   store i32 %arg0.load, i32 addrspace(1)* undef
@@ -1734,17 +1902,21 @@ define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval(i32) %arg0, i
 define void @void_func_byval_i8_align32_i16_align64(i8 addrspace(5)* byval(i8) %arg0, i16 addrspace(5)* byval(i16) align 64 %arg1) #0 {
   ; CHECK-LABEL: name: void_func_byval_i8_align32_i16_align64
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX]](p5)
   ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX1]](p5)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(p1) = COPY [[C]](p1)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(p1) = COPY [[C]](p1)
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p5) :: (dereferenceable load (s8) from %ir.arg0, addrspace 5)
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[COPY1]](p5) :: (dereferenceable load (s16) from %ir.arg1, addrspace 5)
   ; CHECK-NEXT:   G_STORE [[LOAD]](s8), [[C]](p1) :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](s16), [[COPY2]](p1) :: (store (s16) into `i16 addrspace(1)* null`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[LOAD1]](s16), [[COPY3]](p1) :: (store (s16) into `i16 addrspace(1)* null`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   %arg0.load = load i8, i8 addrspace(5)* %arg0
   %arg1.load = load i16, i16 addrspace(5)* %arg1
   store i8 %arg0.load, i8 addrspace(1)* null
@@ -1756,12 +1928,15 @@ define void @void_func_byval_i8_align32_i16_align64(i8 addrspace(5)* byval(i8) %
 define void @byval_a3i32_align128_byval_i16_align64([3 x i32] addrspace(5)* byval([3 x i32]) align 128 %arg0, i16 addrspace(5)* byval(i16) align 64 %arg1) #0 {
   ; CHECK-LABEL: name: byval_a3i32_align128_byval_i16_align64
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX]](p5)
   ; CHECK-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX1]](p5)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(p1) = COPY [[C]](p1)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(p1) = COPY [[C]](p1)
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (dereferenceable load (s32) from %ir.arg0, addrspace 5)
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
@@ -1777,8 +1952,9 @@ define void @byval_a3i32_align128_byval_i16_align64([3 x i32] addrspace(5)* byva
   ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
   ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[C]], [[C4]](s64)
   ; CHECK-NEXT:   G_STORE [[LOAD2]](s32), [[PTR_ADD3]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null` + 8, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD3]](s16), [[COPY2]](p1) :: (store (s16) into `i16 addrspace(1)* null`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[LOAD3]](s16), [[COPY3]](p1) :: (store (s16) into `i16 addrspace(1)* null`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   %arg0.load = load [3 x i32], [3 x i32] addrspace(5)* %arg0
   %arg1.load = load i16, i16 addrspace(5)* %arg1
   store [3 x i32] %arg0.load, [3 x i32] addrspace(1)* null
@@ -1790,7 +1966,7 @@ define void @byval_a3i32_align128_byval_i16_align64([3 x i32] addrspace(5)* byva
 define void @void_func_v32i32_i32_byval_i8(<32 x i32> %arg0, i32 %arg1, i8 addrspace(5)* byval(i8) align 8 %arg2) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_i32_byval_i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1830,12 +2006,14 @@ define void @void_func_v32i32_i32_byval_i8(<32 x i32> %arg0, i32 %arg1, i8 addrs
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.1, addrspace 5)
   ; CHECK-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX2]](p5)
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[C]](p1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[C]](p1)
   ; CHECK-NEXT:   G_STORE [[LOAD1]](s32), [[C]](p1) :: (store (s32) into `i32 addrspace(1)* null`, addrspace 1)
   ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[COPY31]](p5) :: (dereferenceable load (s8) from %ir.arg2, addrspace 5)
-  ; CHECK-NEXT:   G_STORE [[LOAD2]](s8), [[COPY32]](p1) :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[LOAD2]](s8), [[COPY33]](p1) :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY34]]
   store i32 %arg1, i32 addrspace(1)* null
   %arg2.load = load i8, i8 addrspace(5)* %arg2
   store i8 %arg2.load, i8 addrspace(1)* null
@@ -1846,7 +2024,7 @@ define void @void_func_v32i32_i32_byval_i8(<32 x i32> %arg0, i32 %arg1, i8 addrs
 define void @void_func_v32i32_byval_i8_i32(<32 x i32> %arg0, i8 addrspace(5)* byval(i8) %arg1, i32 %arg2) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_byval_i8_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1886,12 +2064,14 @@ define void @void_func_v32i32_byval_i8_i32(<32 x i32> %arg0, i8 addrspace(5)* by
   ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX1]](p5)
   ; CHECK-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.0, align 8, addrspace 5)
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[C]](p1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[C]](p1)
   ; CHECK-NEXT:   G_STORE [[LOAD1]](s32), [[C]](p1) :: (store (s32) into `i32 addrspace(1)* null`, addrspace 1)
   ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[COPY31]](p5) :: (dereferenceable load (s8) from %ir.arg1, addrspace 5)
-  ; CHECK-NEXT:   G_STORE [[LOAD2]](s8), [[COPY32]](p1) :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[LOAD2]](s8), [[COPY33]](p1) :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY34]]
   store i32 %arg2, i32 addrspace(1)* null
   %arg1.load = load i8, i8 addrspace(5)* %arg1
   store i8 %arg1.load, i8 addrspace(1)* null
@@ -1901,7 +2081,7 @@ define void @void_func_v32i32_byval_i8_i32(<32 x i32> %arg0, i8 addrspace(5)* by
 define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_i32_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -1944,13 +2124,15 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
   ; CHECK-NEXT:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.0, addrspace 5)
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](s32), [[COPY31]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[MV]](s64), [[COPY32]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[LOAD1]](s32), [[COPY32]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[MV]](s64), [[COPY33]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY34]]
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile i32 %arg1, i32 addrspace(1)* undef
   store volatile i64 %arg2, i64 addrspace(1)* undef
@@ -1961,7 +2143,7 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_i1_i8_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2007,17 +2189,19 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.1, align 4, addrspace 5)
   ; CHECK-NEXT:   [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD4:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s16) from %fixed-stack.0, align 16, addrspace 5)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY35:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[COPY31]](p1) :: (volatile store (s1) into `i1 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[TRUNC1]](s8), [[COPY32]](p1) :: (volatile store (s8) into `i8 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD3]](s16), [[COPY33]](p1) :: (volatile store (s16) into `i16 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD4]](s16), [[COPY34]](p1) :: (volatile store (s16) into `half addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[TRUNC]](s1), [[COPY32]](p1) :: (volatile store (s1) into `i1 addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[TRUNC1]](s8), [[COPY33]](p1) :: (volatile store (s8) into `i8 addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[LOAD3]](s16), [[COPY34]](p1) :: (volatile store (s16) into `i16 addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[LOAD4]](s16), [[COPY35]](p1) :: (volatile store (s16) into `half addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY36:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY36]]
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile i1 %arg1, i1 addrspace(1)* undef
   store volatile i8 %arg2, i8 addrspace(1)* undef
@@ -2029,7 +2213,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 define void @void_func_v32i32_p3_p5_i16(<32 x i32> %arg0, i8 addrspace(3)* %arg1, i8 addrspace(5)* %arg2) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_p3_p5_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2069,13 +2253,15 @@ define void @void_func_v32i32_p3_p5_i16(<32 x i32> %arg0, i8 addrspace(3)* %arg1
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (p3) from %fixed-stack.1, addrspace 5)
   ; CHECK-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(p5) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (p5) from %fixed-stack.0, align 8, addrspace 5)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](p3), [[COPY31]](p1) :: (volatile store (p3) into `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD2]](p5), [[COPY32]](p1) :: (volatile store (p5) into `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[LOAD1]](p3), [[COPY32]](p1) :: (volatile store (p3) into `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[LOAD2]](p5), [[COPY33]](p1) :: (volatile store (p5) into `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY34]]
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile i8 addrspace(3)* %arg1, i8 addrspace(3)* addrspace(1)* undef
   store volatile i8 addrspace(5)* %arg2, i8 addrspace(5)* addrspace(1)* undef
@@ -2085,7 +2271,7 @@ define void @void_func_v32i32_p3_p5_i16(<32 x i32> %arg0, i8 addrspace(3)* %arg1
 define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_v2i32_v2f32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2131,13 +2317,15 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
   ; CHECK-NEXT:   [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<2 x s32>), [[COPY31]](p1) :: (volatile store (<2 x s32>) into `<2 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<2 x s32>), [[COPY32]](p1) :: (volatile store (<2 x s32>) into `<2 x float> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<2 x s32>), [[COPY32]](p1) :: (volatile store (<2 x s32>) into `<2 x i32> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<2 x s32>), [[COPY33]](p1) :: (volatile store (<2 x s32>) into `<2 x float> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY34]]
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef
   store volatile <2 x float> %arg2, <2 x float> addrspace(1)* undef
@@ -2147,7 +2335,7 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_v2i16_v2f16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2187,13 +2375,15 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (<2 x s16>) from %fixed-stack.1, addrspace 5)
   ; CHECK-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (<2 x s16>) from %fixed-stack.0, align 8, addrspace 5)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](<2 x s16>), [[COPY31]](p1) :: (volatile store (<2 x s16>) into `<2 x i16> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[LOAD2]](<2 x s16>), [[COPY32]](p1) :: (volatile store (<2 x s16>) into `<2 x half> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[LOAD1]](<2 x s16>), [[COPY32]](p1) :: (volatile store (<2 x s16>) into `<2 x i16> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[LOAD2]](<2 x s16>), [[COPY33]](p1) :: (volatile store (<2 x s16>) into `<2 x half> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY34]]
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile <2 x i16> %arg1, <2 x i16> addrspace(1)* undef
   store volatile <2 x half> %arg2, <2 x half> addrspace(1)* undef
@@ -2203,7 +2393,7 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_v2i64_v2f64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2261,13 +2451,15 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
   ; CHECK-NEXT:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD5]](s32), [[LOAD6]](s32)
   ; CHECK-NEXT:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD7]](s32), [[LOAD8]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[COPY31]](p1) :: (volatile store (<2 x s64>) into `<2 x i64> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[COPY32]](p1) :: (volatile store (<2 x s64>) into `<2 x double> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[COPY32]](p1) :: (volatile store (<2 x s64>) into `<2 x i64> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[COPY33]](p1) :: (volatile store (<2 x s64>) into `<2 x double> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY34]]
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef
   store volatile <2 x double> %arg2, <2 x double> addrspace(1)* undef
@@ -2277,7 +2469,7 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_v4i32_v4f32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2331,13 +2523,15 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
   ; CHECK-NEXT:   [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32), [[LOAD8]](s32)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<4 x s32>), [[COPY31]](p1) :: (volatile store (<4 x s32>) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<4 x s32>), [[COPY32]](p1) :: (volatile store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<4 x s32>), [[COPY32]](p1) :: (volatile store (<4 x s32>) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<4 x s32>), [[COPY33]](p1) :: (volatile store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY34]]
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef
   store volatile <4 x float> %arg2, <4 x float> addrspace(1)* undef
@@ -2347,7 +2541,7 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
 define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_v8i32_v8f32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2417,13 +2611,15 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
   ; CHECK-NEXT:   [[FRAME_INDEX16:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX16]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32), [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32), [[LOAD16]](s32)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<8 x s32>), [[COPY31]](p1) :: (volatile store (<8 x s32>) into `<8 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<8 x s32>), [[COPY32]](p1) :: (volatile store (<8 x s32>) into `<8 x float> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<8 x s32>), [[COPY32]](p1) :: (volatile store (<8 x s32>) into `<8 x i32> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<8 x s32>), [[COPY33]](p1) :: (volatile store (<8 x s32>) into `<8 x float> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY34]]
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef
   store volatile <8 x float> %arg2, <8 x float> addrspace(1)* undef
@@ -2433,7 +2629,7 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_v16i32_v16f32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2535,13 +2731,15 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
   ; CHECK-NEXT:   [[FRAME_INDEX32:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK-NEXT:   [[LOAD32:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX32]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[LOAD17]](s32), [[LOAD18]](s32), [[LOAD19]](s32), [[LOAD20]](s32), [[LOAD21]](s32), [[LOAD22]](s32), [[LOAD23]](s32), [[LOAD24]](s32), [[LOAD25]](s32), [[LOAD26]](s32), [[LOAD27]](s32), [[LOAD28]](s32), [[LOAD29]](s32), [[LOAD30]](s32), [[LOAD31]](s32), [[LOAD32]](s32)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<16 x s32>), [[COPY31]](p1) :: (volatile store (<16 x s32>) into `<16 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<16 x s32>), [[COPY32]](p1) :: (volatile store (<16 x s32>) into `<16 x float> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<16 x s32>), [[COPY32]](p1) :: (volatile store (<16 x s32>) into `<16 x i32> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<16 x s32>), [[COPY33]](p1) :: (volatile store (<16 x s32>) into `<16 x float> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY34]]
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef
   store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef
@@ -2552,26 +2750,28 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
   ; CHECK-LABEL: name: void_func_v3f32_wasted_reg
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(p3) = COPY [[DEF]](p3)
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(p3) = COPY [[DEF]](p3)
   ; CHECK-NEXT:   [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<3 x s32>), [[C]](s32)
   ; CHECK-NEXT:   [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<3 x s32>), [[C1]](s32)
   ; CHECK-NEXT:   [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<3 x s32>), [[C2]](s32)
   ; CHECK-NEXT:   G_STORE [[EVEC]](s32), [[DEF]](p3) :: (volatile store (s32) into `float addrspace(3)* undef`, addrspace 3)
   ; CHECK-NEXT:   G_STORE [[EVEC1]](s32), [[DEF]](p3) :: (volatile store (s32) into `float addrspace(3)* undef`, addrspace 3)
   ; CHECK-NEXT:   G_STORE [[EVEC2]](s32), [[DEF]](p3) :: (volatile store (s32) into `float addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   G_STORE [[COPY3]](s32), [[COPY4]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[COPY3]](s32), [[COPY5]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY6]]
   %arg0.0 = extractelement <3 x float> %arg0, i32 0
   %arg0.1 = extractelement <3 x float> %arg0, i32 1
   %arg0.2 = extractelement <3 x float> %arg0, i32 2
@@ -2585,13 +2785,14 @@ define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
 define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
   ; CHECK-LABEL: name: void_func_v3i32_wasted_reg
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
@@ -2603,7 +2804,8 @@ define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
   ; CHECK-NEXT:   G_STORE [[EVEC1]](s32), [[DEF]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
   ; CHECK-NEXT:   G_STORE [[EVEC2]](s32), [[DEF]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
   ; CHECK-NEXT:   G_STORE [[COPY3]](s32), [[DEF]](p3) :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]]
   %arg0.0 = extractelement <3 x i32> %arg0, i32 0
   %arg0.1 = extractelement <3 x i32> %arg0, i32 1
   %arg0.2 = extractelement <3 x i32> %arg0, i32 2
@@ -2618,7 +2820,7 @@ define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
 define void @void_func_v16i8(<16 x i8> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v16i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
@@ -2654,9 +2856,11 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
   ; CHECK-NEXT:   [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16), [[TRUNC12]](s16), [[TRUNC13]](s16), [[TRUNC14]](s16), [[TRUNC15]](s16)
   ; CHECK-NEXT:   [[TRUNC16:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[BUILD_VECTOR]](<16 x s16>)
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[TRUNC16]](<16 x s8>), [[DEF]](p1) :: (volatile store (<16 x s8>) into `<16 x i8> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY16]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY17]]
   store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef
   ret void
 }
@@ -2665,7 +2869,7 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
 define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
   ; CHECK-LABEL: name: void_func_v32i32_v16i8
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2735,11 +2939,13 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
   ; CHECK-NEXT:   [[LOAD16:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX16]](p5) :: (invariant load (s16) from %fixed-stack.0, align 16, addrspace 5)
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s16>) = G_BUILD_VECTOR [[LOAD1]](s16), [[LOAD2]](s16), [[LOAD3]](s16), [[LOAD4]](s16), [[LOAD5]](s16), [[LOAD6]](s16), [[LOAD7]](s16), [[LOAD8]](s16), [[LOAD9]](s16), [[LOAD10]](s16), [[LOAD11]](s16), [[LOAD12]](s16), [[LOAD13]](s16), [[LOAD14]](s16), [[LOAD15]](s16), [[LOAD16]](s16)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[BUILD_VECTOR1]](<16 x s16>)
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[TRUNC]](<16 x s8>), [[COPY31]](p1) :: (volatile store (<16 x s8>) into `<16 x i8> addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[TRUNC]](<16 x s8>), [[COPY32]](p1) :: (volatile store (<16 x s8>) into `<16 x i8> addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY33]]
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef
   ret void
@@ -2748,7 +2954,7 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 define void @pointer_in_struct_argument({i8 addrspace(3)*, i8 addrspace(1)*} %arg0, i8 %pad, {i8 addrspace(3)*, i8 addrspace(1234)*} %arg1) {
   ; CHECK-LABEL: name: pointer_in_struct_argument
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2761,16 +2967,18 @@ define void @pointer_in_struct_argument({i8 addrspace(3)*, i8 addrspace(1)*} %ar
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(p1234) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32)
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(p1) = COPY [[C]](p1)
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p1) = COPY [[C]](p1)
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p1) = COPY [[C]](p1)
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p1) = COPY [[C]](p1)
   ; CHECK-NEXT:   G_STORE [[COPY]](p3), [[C]](p1) :: (volatile store (p3) into `i8 addrspace(3)* addrspace(1)* null`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[MV]](p1), [[COPY7]](p1) :: (volatile store (p1) into `i8 addrspace(1)* addrspace(1)* null`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[TRUNC1]](s8), [[COPY8]](p1) :: (volatile store (s8) into `i8 addrspace(1)* null`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[MV]](p1), [[COPY8]](p1) :: (volatile store (p1) into `i8 addrspace(1)* addrspace(1)* null`, addrspace 1)
+  ; CHECK-NEXT:   G_STORE [[TRUNC1]](s8), [[COPY9]](p1) :: (volatile store (s8) into `i8 addrspace(1)* null`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[COPY4]](p3), [[C]](p1) :: (volatile store (p3) into `i8 addrspace(3)* addrspace(1)* null`, addrspace 1)
-  ; CHECK-NEXT:   G_STORE [[MV1]](p1234), [[COPY9]](p1) :: (volatile store (p1234) into `i8 addrspace(1234)* addrspace(1)* null`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[MV1]](p1234), [[COPY10]](p1) :: (volatile store (p1234) into `i8 addrspace(1234)* addrspace(1)* null`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY7]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY11]]
   %val0 = extractvalue {i8 addrspace(3)*, i8 addrspace(1)*} %arg0, 0
   %val1 = extractvalue {i8 addrspace(3)*, i8 addrspace(1)*} %arg0, 1
   %val2 = extractvalue {i8 addrspace(3)*, i8 addrspace(1234)*} %arg1, 0
@@ -2786,7 +2994,7 @@ define void @pointer_in_struct_argument({i8 addrspace(3)*, i8 addrspace(1)*} %ar
 define void @vector_ptr_in_struct_arg({ <2 x i8 addrspace(1)*>, <2 x i8 addrspace(3)*> } %arg) {
   ; CHECK-LABEL: name: vector_ptr_in_struct_arg
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -2798,12 +3006,14 @@ define void @vector_ptr_in_struct_arg({ <2 x i8 addrspace(1)*>, <2 x i8 addrspac
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(p3) = COPY $vgpr4
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(p3) = COPY $vgpr5
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY4]](p3), [[COPY5]](p3)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `{ <2 x i8 addrspace(1)*>, <2 x i8 addrspace(3)*> } addrspace(1)* undef`, addrspace 1)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C]](s64)
   ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<2 x p3>), [[PTR_ADD]](p1) :: (store (<2 x p3>) into `{ <2 x i8 addrspace(1)*>, <2 x i8 addrspace(3)*> } addrspace(1)* undef` + 16, align 16, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY7]]
   store { <2 x i8 addrspace(1)*>, <2 x i8 addrspace(3)*> } %arg, { <2 x i8 addrspace(1)*>, <2 x i8 addrspace(3)*> } addrspace(1)* undef
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
index 2e660884e7db1..c3cc9ce6e54c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
@@ -5,7 +5,7 @@
 define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i64(<2 x i32 addrspace(1)*> %ptr, <2 x i64> %idx) {
   ; CHECK-LABEL: name: vector_gep_v2p1_index_v2i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -21,17 +21,19 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i64(<2 x i32 addrspace(1
   ; CHECK-NEXT:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK-NEXT:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
   ; CHECK-NEXT:   [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(<2 x p1>) = G_PTR_ADD [[BUILD_VECTOR]], [[MUL]](<2 x s64>)
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY8]](<2 x p1>)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY9]](<2 x p1>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
   ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %gep = getelementptr i32, <2 x i32 addrspace(1)*> %ptr, <2 x i64> %idx
   ret <2 x i32 addrspace(1)*> %gep
 }
@@ -40,7 +42,7 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i64(<2 x i32 addrspace(1
 define <2 x i32 addrspace(3)*> @vector_gep_v2p3_index_v2i32(<2 x i32 addrspace(3)*> %ptr, <2 x i32> %idx) {
   ; CHECK-LABEL: name: vector_gep_v2p3_index_v2i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
@@ -48,15 +50,17 @@ define <2 x i32 addrspace(3)*> @vector_gep_v2p3_index_v2i32(<2 x i32 addrspace(3
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
   ; CHECK-NEXT:   [[MUL:%[0-9]+]]:_(<2 x s32>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(<2 x p3>) = G_PTR_ADD [[BUILD_VECTOR]], [[MUL]](<2 x s32>)
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<2 x p3>) = COPY [[PTR_ADD]](<2 x p3>)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY4]](<2 x p3>)
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<2 x p3>) = COPY [[PTR_ADD]](<2 x p3>)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY5]](<2 x p3>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY6]], implicit $vgpr0, implicit $vgpr1
   %gep = getelementptr i32, <2 x i32 addrspace(3)*> %ptr, <2 x i32> %idx
   ret <2 x i32 addrspace(3)*> %gep
 }
@@ -65,7 +69,7 @@ define <2 x i32 addrspace(3)*> @vector_gep_v2p3_index_v2i32(<2 x i32 addrspace(3
 define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i32(<2 x i32 addrspace(1)*> %ptr, <2 x i32> %idx) {
   ; CHECK-LABEL: name: vector_gep_v2p1_index_v2i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -77,18 +81,20 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i32(<2 x i32 addrspace(1
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(<2 x s64>) = G_SEXT [[BUILD_VECTOR1]](<2 x s32>)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
   ; CHECK-NEXT:   [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[SEXT]], [[BUILD_VECTOR2]]
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(<2 x p1>) = G_PTR_ADD [[BUILD_VECTOR]], [[MUL]](<2 x s64>)
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY6]](<2 x p1>)
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY7]](<2 x p1>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
   ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY8]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %gep = getelementptr i32, <2 x i32 addrspace(1)*> %ptr, <2 x i32> %idx
   ret <2 x i32 addrspace(1)*> %gep
 }
@@ -97,7 +103,7 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i32(<2 x i32 addrspace(1
 define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_i64(<2 x i32 addrspace(1)*> %ptr, i64 %idx) {
   ; CHECK-LABEL: name: vector_gep_v2p1_index_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -109,19 +115,21 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_i64(<2 x i32 addrspace(1)*
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK-NEXT:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV2]](s64)
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<2 x s64>) = COPY [[BUILD_VECTOR1]](<2 x s64>)
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<2 x s64>) = COPY [[BUILD_VECTOR1]](<2 x s64>)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
-  ; CHECK-NEXT:   [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[COPY6]], [[BUILD_VECTOR2]]
+  ; CHECK-NEXT:   [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[COPY7]], [[BUILD_VECTOR2]]
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(<2 x p1>) = G_PTR_ADD [[BUILD_VECTOR]], [[MUL]](<2 x s64>)
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY7]](<2 x p1>)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY8]](<2 x p1>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
   ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY9]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %gep = getelementptr i32, <2 x i32 addrspace(1)*> %ptr, i64 %idx
   ret <2 x i32 addrspace(1)*> %gep
 }
@@ -130,7 +138,7 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_i64(<2 x i32 addrspace(1)*
 define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_i32(<2 x i32 addrspace(1)*> %ptr, i32 %idx) {
   ; CHECK-LABEL: name: vector_gep_v2p1_index_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -140,19 +148,21 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_i32(<2 x i32 addrspace(1)*
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY4]](s32)
   ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(<2 x s64>) = G_SEXT [[BUILD_VECTOR1]](<2 x s32>)
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
   ; CHECK-NEXT:   [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[SEXT]], [[BUILD_VECTOR2]]
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(<2 x p1>) = G_PTR_ADD [[BUILD_VECTOR]], [[MUL]](<2 x s64>)
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY5]](<2 x p1>)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY6]](<2 x p1>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
   ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY5]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY7]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %gep = getelementptr i32, <2 x i32 addrspace(1)*> %ptr, i32 %idx
   ret <2 x i32 addrspace(1)*> %gep
 }
@@ -161,7 +171,7 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_i32(<2 x i32 addrspace(1)*
 define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i64_constant(<2 x i32 addrspace(1)*> %ptr, <2 x i64> %idx) {
   ; CHECK-LABEL: name: vector_gep_v2p1_index_v2i64_constant
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -177,6 +187,7 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i64_constant(<2 x i32 ad
   ; CHECK-NEXT:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK-NEXT:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
   ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64)
@@ -185,13 +196,14 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i64_constant(<2 x i32 ad
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
   ; CHECK-NEXT:   [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C2]](s64), [[C3]](s64)
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(<2 x p1>) = G_PTR_ADD [[BUILD_VECTOR]], [[BUILD_VECTOR4]](<2 x s64>)
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY8]](<2 x p1>)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(<2 x p1>) = COPY [[PTR_ADD]](<2 x p1>)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY9]](<2 x p1>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
   ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY10]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   %gep = getelementptr i32, <2 x i32 addrspace(1)*> %ptr, <2 x i64> <i64 1, i64 2>
   ret <2 x i32 addrspace(1)*> %gep
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
index f0383259abb7b..cc2c95324c2ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
@@ -57,17 +57,19 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(void()* %fptr) {
 define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(void()* %fptr) {
   ; CHECK-LABEL: name: test_gfx_indirect_call_sgpr_ptr
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY4]]
   call amdgpu_gfx void %fptr()
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
index b5d9b33122ba6..d31e73e91b607 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
@@ -51,12 +51,16 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() {
 define i32 @asm_vgpr_early_clobber() {
   ; CHECK-LABEL: name: asm_vgpr_early_clobber
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1, !0
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY %0
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %2, !0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
-  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY %2
+  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY2]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[ADD]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   call { i32, i32 } asm sideeffect "v_mov_b32 $0, 7; v_mov_b32 $1, 7", "=&v,=&v"(), !srcloc !0
   %asmresult = extractvalue { i32, i32 } %1, 0
   %asmresult1 = extractvalue { i32, i32 } %1, 1
@@ -67,10 +71,14 @@ define i32 @asm_vgpr_early_clobber() {
 define i32 @test_specific_vgpr_output() nounwind {
   ; CHECK-LABEL: name: test_specific_vgpr_output
   ; CHECK: bb.1.entry:
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   INLINEASM &"v_mov_b32 v1, 7", 0 /* attdialect */, 10 /* regdef */, implicit-def $vgpr1
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
 entry:
   %0 = tail call i32 asm "v_mov_b32 v1, 7", "={v1}"() nounwind
   ret i32 %0
@@ -79,10 +87,14 @@ entry:
 define i32 @test_single_vgpr_output() nounwind {
   ; CHECK-LABEL: name: test_single_vgpr_output
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY %0
-  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
 entry:
   %0 = tail call i32 asm "v_mov_b32 $0, 7", "=v"() nounwind
   ret i32 %0
@@ -91,10 +103,14 @@ entry:
 define i32 @test_single_sgpr_output_s32() nounwind {
   ; CHECK-LABEL: name: test_single_sgpr_output_s32
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %0
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY %0
-  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
 entry:
   %0 = tail call i32 asm "s_mov_b32 $0, 7", "=s"() nounwind
   ret i32 %0
@@ -104,12 +120,16 @@ entry:
 define float @test_multiple_register_outputs_same() #0 {
   ; CHECK-LABEL: name: test_multiple_register_outputs_same
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0, 1835018 /* regdef:VGPR_32 */, def %1
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY %0
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 1835018 /* regdef:VGPR_32 */, def %2
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
-  ; CHECK-NEXT:   [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY %2
+  ; CHECK-NEXT:   [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY1]], [[COPY2]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[FADD]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %1 = call { float, float } asm "v_mov_b32 $0, 0; v_mov_b32 $1, 1", "=v,=v"()
   %asmresult = extractvalue { float, float } %1, 0
   %asmresult1 = extractvalue { float, float } %1, 1
@@ -121,13 +141,17 @@ define float @test_multiple_register_outputs_same() #0 {
 define double @test_multiple_register_outputs_mixed() #0 {
   ; CHECK-LABEL: name: test_multiple_register_outputs_mixed
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0, 2949130 /* regdef:VReg_64 */, def %1
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY %0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY %1
-  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2949130 /* regdef:VReg_64 */, def %2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s64) = COPY %2
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0, implicit $vgpr1
   %1 = call { float, double } asm "v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", "=v,=v"()
   %asmresult = extractvalue { float, double } %1, 1
   ret double %asmresult
@@ -137,12 +161,16 @@ define double @test_multiple_register_outputs_mixed() #0 {
 define float @test_vector_output() nounwind {
   ; CHECK-LABEL: name: test_vector_output
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   INLINEASM &"v_add_f64 $0, 0, 0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $vgpr14_vgpr15
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr14_vgpr15
-  ; CHECK-NEXT:   [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr14_vgpr15
+  ; CHECK-NEXT:   [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY1]](<2 x s32>), [[C]](s32)
   ; CHECK-NEXT:   $vgpr0 = COPY [[EVEC]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %1 = tail call <2 x float> asm sideeffect "v_add_f64 $0, 0, 0", "={v[14:15]}"() nounwind
   %2 = extractelement <2 x float> %1, i32 0
   ret float %2
@@ -184,14 +212,16 @@ define amdgpu_kernel void @test_input_imm() {
 define float @test_input_vgpr(i32 %src) nounwind {
   ; CHECK-LABEL: name: test_input_vgpr
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 1835017 /* reguse:VGPR_32 */, [[COPY1]]
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY %1
-  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY2]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 1835017 /* reguse:VGPR_32 */, [[COPY2]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY %2
+  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0
 entry:
   %0 = tail call float asm "v_add_f32 $0, 1.0, $1", "=v,v"(i32 %src) nounwind
   ret float %0
@@ -200,13 +230,15 @@ entry:
 define i32 @test_memory_constraint(i32 addrspace(3)* %a) nounwind {
   ; CHECK-LABEL: name: test_memory_constraint
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-  ; CHECK-NEXT:   INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 196622 /* mem:m */, [[COPY]](p3)
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
-  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 196622 /* mem:m */, [[COPY]](p3)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY %2
+  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %1 = tail call i32 asm "ds_read_b32 $0, $1", "=v,*m"(i32 addrspace(3)* %a)
   ret i32 %1
 }
@@ -214,16 +246,18 @@ define i32 @test_memory_constraint(i32 addrspace(3)* %a) nounwind {
 define i32 @test_vgpr_matching_constraint(i32 %a) nounwind {
   ; CHECK-LABEL: name: test_vgpr_matching_constraint
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; CHECK-NEXT:   [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32)
-  ; CHECK-NEXT:   INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %3, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY %3
-  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY2]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32)
+  ; CHECK-NEXT:   INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY %4
+  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0
   %and = and i32 %a, 1
   %asm = call i32 asm sideeffect ";", "=v,0"(i32 %and)
   ret i32 %asm
@@ -232,16 +266,20 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind {
 define i32 @test_sgpr_matching_constraint() nounwind {
   ; CHECK-LABEL: name: test_sgpr_matching_constraint
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %0
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY %0
-  ; CHECK-NEXT:   INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %2
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %2
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+  ; CHECK-NEXT:   INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %3
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY %3
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32)
-  ; CHECK-NEXT:   INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %4, 1966089 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3)
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY %4
-  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY4]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY2]](s32)
+  ; CHECK-NEXT:   INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %5, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3)
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY %5
+  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY5]](s32)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY6]], implicit $vgpr0
 entry:
   %asm0 = tail call i32 asm "s_mov_b32 $0, 7", "=s"() nounwind
   %asm1 = tail call i32 asm "s_mov_b32 $0, 8", "=s"() nounwind
@@ -252,23 +290,25 @@ entry:
 define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind {
   ; CHECK-LABEL: name: test_many_matching_constraints
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32)
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
-  ; CHECK-NEXT:   INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %3, 1835018 /* regdef:VGPR_32 */, def %4, 1835018 /* regdef:VGPR_32 */, def %5, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5)
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY %3
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
+  ; CHECK-NEXT:   INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 1835018 /* regdef:VGPR_32 */, def %5, 1835018 /* regdef:VGPR_32 */, def %6, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5)
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY %4
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY %5
-  ; CHECK-NEXT:   G_STORE [[COPY6]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY %6
   ; CHECK-NEXT:   G_STORE [[COPY7]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[COPY8]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   G_STORE [[COPY9]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY10]]
   %asm = call {i32, i32, i32} asm sideeffect "; ", "=v,=v,=v,0,2,1"(i32 %c, i32 %a, i32 %b)
   %asmresult0 = extractvalue  {i32, i32, i32} %asm, 0
   store i32 %asmresult0, i32 addrspace(1)* undef
@@ -282,13 +322,17 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind {
 define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind {
   ; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %0
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY %0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY %2
-  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY2]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
+  ; CHECK-NEXT:   INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %3, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY %3
+  ; CHECK-NEXT:   $vgpr0 = COPY [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0
 entry:
   %asm0 = tail call i32 asm "s_mov_b32 $0, 7", "=s"() nounwind
   %asm1 = tail call i32 asm "v_mov_b32 $0, $1", "=v,0"(i32 %asm0) nounwind

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
index 8e90bd593a108..ec903d3462880 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
@@ -6,16 +6,18 @@
 define void @test_memcpy_p1_p3_i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) {
   ; CHECK-LABEL: name: test_memcpy_p1_p3_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
   ; CHECK-NEXT:   G_MEMCPY [[MV]](p1), [[COPY2]](p3), [[TRUNC]](s32), 0 :: (store (s8) into %ir.dst, addrspace 1), (load (s8) from %ir.src, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i64 256, i1 false)
   ret void
 }
@@ -23,15 +25,17 @@ define void @test_memcpy_p1_p3_i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src)
 define void @test_memcpy_p1_p3_i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) {
   ; CHECK-LABEL: name: test_memcpy_p1_p3_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
   ; CHECK-NEXT:   G_MEMCPY [[MV]](p1), [[COPY2]](p3), [[C]](s32), 0 :: (store (s8) into %ir.dst, addrspace 1), (load (s8) from %ir.src, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 256, i1 false)
   ret void
 }
@@ -39,16 +43,18 @@ define void @test_memcpy_p1_p3_i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src)
 define void @test_memcpy_p1_p3_i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) {
   ; CHECK-LABEL: name: test_memcpy_p1_p3_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16)
   ; CHECK-NEXT:   G_MEMCPY [[MV]](p1), [[COPY2]](p3), [[ZEXT]](s32), 0 :: (store (s8) into %ir.dst, addrspace 1), (load (s8) from %ir.src, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memcpy.p1i8.p3i8.i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i16 256, i1 false)
   ret void
 }
@@ -56,16 +62,18 @@ define void @test_memcpy_p1_p3_i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src)
 define void @test_memcpy_p3_p1_i64(i8 addrspace(3)* %dst, i8 addrspace(1)* %src) {
   ; CHECK-LABEL: name: test_memcpy_p3_p1_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
   ; CHECK-NEXT:   G_MEMCPY [[COPY]](p3), [[MV]](p1), [[TRUNC]](s32), 0 :: (store (s8) into %ir.dst, addrspace 3), (load (s8) from %ir.src, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i64 256, i1 false)
   ret void
 }
@@ -73,15 +81,17 @@ define void @test_memcpy_p3_p1_i64(i8 addrspace(3)* %dst, i8 addrspace(1)* %src)
 define void @test_memcpy_p3_p1_i32(i8 addrspace(3)* %dst, i8 addrspace(1)* %src) {
   ; CHECK-LABEL: name: test_memcpy_p3_p1_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
   ; CHECK-NEXT:   G_MEMCPY [[COPY]](p3), [[MV]](p1), [[C]](s32), 0 :: (store (s8) into %ir.dst, addrspace 3), (load (s8) from %ir.src, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 256, i1 false)
   ret void
 }
@@ -89,16 +99,18 @@ define void @test_memcpy_p3_p1_i32(i8 addrspace(3)* %dst, i8 addrspace(1)* %src)
 define void @test_memcpy_p3_p1_i16(i8 addrspace(3)* %dst, i8 addrspace(1)* %src) {
   ; CHECK-LABEL: name: test_memcpy_p3_p1_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16)
   ; CHECK-NEXT:   G_MEMCPY [[COPY]](p3), [[MV]](p1), [[ZEXT]](s32), 0 :: (store (s8) into %ir.dst, addrspace 3), (load (s8) from %ir.src, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memcpy.p3i8.p1i8.i16(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i16 256, i1 false)
   ret void
 }
@@ -106,16 +118,18 @@ define void @test_memcpy_p3_p1_i16(i8 addrspace(3)* %dst, i8 addrspace(1)* %src)
 define void @test_memmove_p1_p3_i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) {
   ; CHECK-LABEL: name: test_memmove_p1_p3_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
   ; CHECK-NEXT:   G_MEMMOVE [[MV]](p1), [[COPY2]](p3), [[TRUNC]](s32), 0 :: (store (s8) into %ir.dst, addrspace 1), (load (s8) from %ir.src, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memmove.p1i8.p3i8.i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i64 256, i1 false)
   ret void
 }
@@ -123,15 +137,17 @@ define void @test_memmove_p1_p3_i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src
 define void @test_memmove_p1_p3_i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) {
   ; CHECK-LABEL: name: test_memmove_p1_p3_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
   ; CHECK-NEXT:   G_MEMMOVE [[MV]](p1), [[COPY2]](p3), [[C]](s32), 0 :: (store (s8) into %ir.dst, addrspace 1), (load (s8) from %ir.src, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 256, i1 false)
   ret void
 }
@@ -139,16 +155,18 @@ define void @test_memmove_p1_p3_i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src
 define void @test_memmove_p1_p3_i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src) {
   ; CHECK-LABEL: name: test_memmove_p1_p3_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16)
   ; CHECK-NEXT:   G_MEMMOVE [[MV]](p1), [[COPY2]](p3), [[ZEXT]](s32), 0 :: (store (s8) into %ir.dst, addrspace 1), (load (s8) from %ir.src, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memmove.p1i8.p3i8.i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i16 256, i1 false)
   ret void
 }
@@ -156,16 +174,18 @@ define void @test_memmove_p1_p3_i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src
 define void @test_memset_p1_i64(i8 addrspace(1)* %dst, i8 %val) {
   ; CHECK-LABEL: name: test_memset_p1_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
   ; CHECK-NEXT:   G_MEMSET [[MV]](p1), [[TRUNC]](s8), [[C]](s64), 0 :: (store (s8) into %ir.dst, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 256, i1 false)
   ret void
 }
@@ -173,17 +193,19 @@ define void @test_memset_p1_i64(i8 addrspace(1)* %dst, i8 %val) {
 define void @test_memset_p1_i32(i8 addrspace(1)* %dst, i8 %val) {
   ; CHECK-LABEL: name: test_memset_p1_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[C]](s32)
   ; CHECK-NEXT:   G_MEMSET [[MV]](p1), [[TRUNC]](s8), [[ZEXT]](s64), 0 :: (store (s8) into %ir.dst, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memset.p1i8.i32(i8 addrspace(1)* %dst, i8 %val, i32 256, i1 false)
   ret void
 }
@@ -191,17 +213,19 @@ define void @test_memset_p1_i32(i8 addrspace(1)* %dst, i8 %val) {
 define void @test_memset_p1_i16(i8 addrspace(1)* %dst, i8 %val) {
   ; CHECK-LABEL: name: test_memset_p1_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[C]](s16)
   ; CHECK-NEXT:   G_MEMSET [[MV]](p1), [[TRUNC]](s8), [[ZEXT]](s64), 0 :: (store (s8) into %ir.dst, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
   call void @llvm.memset.p1i8.i16(i8 addrspace(1)* %dst, i8 %val, i16 256, i1 false)
   ret void
 }
@@ -209,15 +233,17 @@ define void @test_memset_p1_i16(i8 addrspace(1)* %dst, i8 %val) {
 define void @test_memset_p3_i64(i8 addrspace(3)* %dst, i8 %val) {
   ; CHECK-LABEL: name: test_memset_p3_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
   ; CHECK-NEXT:   G_MEMSET [[COPY]](p3), [[TRUNC]](s8), [[TRUNC1]](s32), 0 :: (store (s8) into %ir.dst, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %dst, i8 %val, i64 256, i1 false)
   ret void
 }
@@ -225,14 +251,16 @@ define void @test_memset_p3_i64(i8 addrspace(3)* %dst, i8 %val) {
 define void @test_memset_p3_i32(i8 addrspace(3)* %dst, i8 %val) {
   ; CHECK-LABEL: name: test_memset_p3_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
   ; CHECK-NEXT:   G_MEMSET [[COPY]](p3), [[TRUNC]](s8), [[C]](s32), 0 :: (store (s8) into %ir.dst, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %dst, i8 %val, i32 256, i1 false)
   ret void
 }
@@ -240,15 +268,17 @@ define void @test_memset_p3_i32(i8 addrspace(3)* %dst, i8 %val) {
 define void @test_memset_p3_i16(i8 addrspace(3)* %dst, i8 %val) {
   ; CHECK-LABEL: name: test_memset_p3_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16)
   ; CHECK-NEXT:   G_MEMSET [[COPY]](p3), [[TRUNC]](s8), [[ZEXT]](s32), 0 :: (store (s8) into %ir.dst, addrspace 3)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
   call void @llvm.memset.p3i8.i16(i8 addrspace(3)* %dst, i8 %val, i16 256, i1 false)
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll
index a1a72c07cb631..99a2875e8b160 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll
@@ -5,12 +5,15 @@
 define i32 @reloc_constant() {
   ; CHECK-LABEL: name: reloc_constant
   ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[INT0:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !0
   ; We cannot have any specific metadata check here as ConstantAsMetadata is printed as <raw_ptr_val>
   ; CHECK:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), <0x{{[0-9a-f]+}}>
   ; CHECK:   [[SUM:%[0-9]+]]:_(s32) = G_ADD [[INT0]], [[INT1]]
   ; CHECK:   $vgpr0 = COPY [[SUM]](s32)
-  ; CHECK:   SI_RETURN implicit $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val0 = call i32 @llvm.amdgcn.reloc.constant(metadata !0)
   %val1 = call i32 @llvm.amdgcn.reloc.constant(metadata i32 4)
   %res = add i32 %val0, %val1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
index af0daceac6c61..a88c4224a2fa7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
@@ -4,7 +4,7 @@
 define i8* @ptrmask_flat_i64(i8* %ptr, i64 %mask) {
   ; CHECK-LABEL: name: ptrmask_flat_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -12,11 +12,13 @@ define i8* @ptrmask_flat_i64(i8* %ptr, i64 %mask) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[MV1]](s64)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %ptr, i64 %mask)
   ret i8* %masked
 }
@@ -24,17 +26,19 @@ define i8* @ptrmask_flat_i64(i8* %ptr, i64 %mask) {
 define i8* @ptrmask_flat_i32(i8* %ptr, i32 %mask) {
   ; CHECK-LABEL: name: ptrmask_flat_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[COPY2]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0, implicit $vgpr1
   %masked = call i8* @llvm.ptrmask.p0i8.i32(i8* %ptr, i32 %mask)
   ret i8* %masked
 }
@@ -42,18 +46,20 @@ define i8* @ptrmask_flat_i32(i8* %ptr, i32 %mask) {
 define i8* @ptrmask_flat_i16(i8* %ptr, i16 %mask) {
   ; CHECK-LABEL: name: ptrmask_flat_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[TRUNC]](s16)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0, implicit $vgpr1
   %masked = call i8* @llvm.ptrmask.p0i8.i16(i8* %ptr, i16 %mask)
   ret i8* %masked
 }
@@ -61,18 +67,20 @@ define i8* @ptrmask_flat_i16(i8* %ptr, i16 %mask) {
 define i8* @ptrmask_flat_i1(i8* %ptr, i1 %mask) {
   ; CHECK-LABEL: name: ptrmask_flat_i1
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[TRUNC]](s1)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0, implicit $vgpr1
   %masked = call i8* @llvm.ptrmask.p0i8.i1(i8* %ptr, i1 %mask)
   ret i8* %masked
 }
@@ -80,15 +88,17 @@ define i8* @ptrmask_flat_i1(i8* %ptr, i1 %mask) {
 define i8 addrspace(3)* @ptrmask_local_i64(i8 addrspace(3)* %ptr, i64 %mask) {
   ; CHECK-LABEL: name: ptrmask_local_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[MV]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[PTRMASK]](p3)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0
   %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i64(i8 addrspace(3)* %ptr, i64 %mask)
   ret i8 addrspace(3)* %masked
 }
@@ -96,13 +106,15 @@ define i8 addrspace(3)* @ptrmask_local_i64(i8 addrspace(3)* %ptr, i64 %mask) {
 define i8 addrspace(3)* @ptrmask_local_i32(i8 addrspace(3)* %ptr, i32 %mask) {
   ; CHECK-LABEL: name: ptrmask_local_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[COPY1]](s32)
   ; CHECK-NEXT:   $vgpr0 = COPY [[PTRMASK]](p3)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* %ptr, i32 %mask)
   ret i8 addrspace(3)* %masked
 }
@@ -110,14 +122,16 @@ define i8 addrspace(3)* @ptrmask_local_i32(i8 addrspace(3)* %ptr, i32 %mask) {
 define i8 addrspace(3)* @ptrmask_local_i16(i8 addrspace(3)* %ptr, i16 %mask) {
   ; CHECK-LABEL: name: ptrmask_local_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[TRUNC]](s16)
   ; CHECK-NEXT:   $vgpr0 = COPY [[PTRMASK]](p3)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i16(i8 addrspace(3)* %ptr, i16 %mask)
   ret i8 addrspace(3)* %masked
 }
@@ -125,14 +139,16 @@ define i8 addrspace(3)* @ptrmask_local_i16(i8 addrspace(3)* %ptr, i16 %mask) {
 define i8 addrspace(3)* @ptrmask_local_i1(i8 addrspace(3)* %ptr, i1 %mask) {
   ; CHECK-LABEL: name: ptrmask_local_i1
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[TRUNC]](s1)
   ; CHECK-NEXT:   $vgpr0 = COPY [[PTRMASK]](p3)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i1(i8 addrspace(3)* %ptr, i1 %mask)
   ret i8 addrspace(3)* %masked
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
index 4faea44d1582e..0db076c6b0fcf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
@@ -4,16 +4,18 @@
 define i16 @uaddsat_i16(i16 %lhs, i16 %rhs) {
   ; CHECK-LABEL: name: uaddsat_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC1]]
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UADDSAT]](s16)
   ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %res
 }
@@ -22,13 +24,15 @@ declare i16 @llvm.uadd.sat.i16(i16, i16)
 define i32 @uaddsat_i32(i32 %lhs, i32 %rhs) {
   ; CHECK-LABEL: name: uaddsat_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[COPY]], [[COPY1]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[UADDSAT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %res
 }
@@ -37,7 +41,7 @@ declare i32 @llvm.uadd.sat.i32(i32, i32)
 define i64 @uaddsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-LABEL: name: uaddsat_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -45,11 +49,13 @@ define i64 @uaddsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[UADDSAT:%[0-9]+]]:_(s64) = G_UADDSAT [[MV]], [[MV1]]
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UADDSAT]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %res
 }
@@ -58,7 +64,7 @@ declare i64 @llvm.uadd.sat.i64(i64, i64)
 define <2 x i32> @uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-LABEL: name: uaddsat_v2i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -66,11 +72,13 @@ define <2 x i32> @uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[UADDSAT:%[0-9]+]]:_(<2 x s32>) = G_UADDSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UADDSAT]](<2 x s32>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %res
 }
@@ -79,16 +87,18 @@ declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>)
 define i16 @saddsat_i16(i16 %lhs, i16 %rhs) {
   ; CHECK-LABEL: name: saddsat_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SADDSAT:%[0-9]+]]:_(s16) = G_SADDSAT [[TRUNC]], [[TRUNC1]]
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SADDSAT]](s16)
   ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %res
 }
@@ -97,13 +107,15 @@ declare i16 @llvm.sadd.sat.i16(i16, i16)
 define i32 @saddsat_i32(i32 %lhs, i32 %rhs) {
   ; CHECK-LABEL: name: saddsat_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SADDSAT:%[0-9]+]]:_(s32) = G_SADDSAT [[COPY]], [[COPY1]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[SADDSAT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %res
 }
@@ -112,7 +124,7 @@ declare i32 @llvm.sadd.sat.i32(i32, i32)
 define i64 @saddsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-LABEL: name: saddsat_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -120,11 +132,13 @@ define i64 @saddsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SADDSAT:%[0-9]+]]:_(s64) = G_SADDSAT [[MV]], [[MV1]]
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SADDSAT]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %res
 }
@@ -133,7 +147,7 @@ declare i64 @llvm.sadd.sat.i64(i64, i64)
 define <2 x i32> @saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-LABEL: name: saddsat_v2i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -141,11 +155,13 @@ define <2 x i32> @saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SADDSAT:%[0-9]+]]:_(<2 x s32>) = G_SADDSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SADDSAT]](<2 x s32>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %res
 }
@@ -154,16 +170,18 @@ declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>)
 define i16 @usubsat_i16(i16 %lhs, i16 %rhs) {
   ; CHECK-LABEL: name: usubsat_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC1]]
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USUBSAT]](s16)
   ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %res
 }
@@ -172,13 +190,15 @@ declare i16 @llvm.usub.sat.i16(i16, i16)
 define i32 @usubsat_i32(i32 %lhs, i32 %rhs) {
   ; CHECK-LABEL: name: usubsat_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[COPY]], [[COPY1]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[USUBSAT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %res
 }
@@ -187,7 +207,7 @@ declare i32 @llvm.usub.sat.i32(i32, i32)
 define i64 @usubsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-LABEL: name: usubsat_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -195,11 +215,13 @@ define i64 @usubsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[USUBSAT:%[0-9]+]]:_(s64) = G_USUBSAT [[MV]], [[MV1]]
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USUBSAT]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %res
 }
@@ -208,7 +230,7 @@ declare i64 @llvm.usub.sat.i64(i64, i64)
 define <2 x i32> @usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-LABEL: name: usubsat_v2i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -216,11 +238,13 @@ define <2 x i32> @usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[USUBSAT:%[0-9]+]]:_(<2 x s32>) = G_USUBSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USUBSAT]](<2 x s32>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %res
 }
@@ -229,16 +253,18 @@ declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>)
 define i16 @ssubsat_i16(i16 %lhs, i16 %rhs) {
   ; CHECK-LABEL: name: ssubsat_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SSUBSAT:%[0-9]+]]:_(s16) = G_SSUBSAT [[TRUNC]], [[TRUNC1]]
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SSUBSAT]](s16)
   ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %res
 }
@@ -247,13 +273,15 @@ declare i16 @llvm.ssub.sat.i16(i16, i16)
 define i32 @ssubsat_i32(i32 %lhs, i32 %rhs) {
   ; CHECK-LABEL: name: ssubsat_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SSUBSAT:%[0-9]+]]:_(s32) = G_SSUBSAT [[COPY]], [[COPY1]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[SSUBSAT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %res
 }
@@ -262,7 +290,7 @@ declare i32 @llvm.ssub.sat.i32(i32, i32)
 define i64 @ssubsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-LABEL: name: ssubsat_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -270,11 +298,13 @@ define i64 @ssubsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SSUBSAT:%[0-9]+]]:_(s64) = G_SSUBSAT [[MV]], [[MV1]]
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSUBSAT]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %res
 }
@@ -283,7 +313,7 @@ declare i64 @llvm.ssub.sat.i64(i64, i64)
 define <2 x i32> @ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-LABEL: name: ssubsat_v2i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -291,11 +321,13 @@ define <2 x i32> @ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SSUBSAT:%[0-9]+]]:_(<2 x s32>) = G_SSUBSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSUBSAT]](<2 x s32>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %res
 }
@@ -304,16 +336,18 @@ declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>)
 define i16 @ushlsat_i16(i16 %lhs, i16 %rhs) {
   ; CHECK-LABEL: name: ushlsat_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[USHLSAT:%[0-9]+]]:_(s16) = G_USHLSAT [[TRUNC]], [[TRUNC1]](s16)
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USHLSAT]](s16)
   ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i16 @llvm.ushl.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %res
 }
@@ -322,13 +356,15 @@ declare i16 @llvm.ushl.sat.i16(i16, i16)
 define i32 @ushlsat_i32(i32 %lhs, i32 %rhs) {
   ; CHECK-LABEL: name: ushlsat_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[USHLSAT:%[0-9]+]]:_(s32) = G_USHLSAT [[COPY]], [[COPY1]](s32)
   ; CHECK-NEXT:   $vgpr0 = COPY [[USHLSAT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i32 @llvm.ushl.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %res
 }
@@ -337,7 +373,7 @@ declare i32 @llvm.ushl.sat.i32(i32, i32)
 define i64 @ushlsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-LABEL: name: ushlsat_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -345,11 +381,13 @@ define i64 @ushlsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[USHLSAT:%[0-9]+]]:_(s64) = G_USHLSAT [[MV]], [[MV1]](s64)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USHLSAT]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call i64 @llvm.ushl.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %res
 }
@@ -358,7 +396,7 @@ declare i64 @llvm.ushl.sat.i64(i64, i64)
 define <2 x i32> @ushlsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-LABEL: name: ushlsat_v2i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -366,11 +404,13 @@ define <2 x i32> @ushlsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[USHLSAT:%[0-9]+]]:_(<2 x s32>) = G_USHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s32>)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USHLSAT]](<2 x s32>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call <2 x i32> @llvm.ushl.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %res
 }
@@ -379,16 +419,18 @@ declare <2 x i32> @llvm.ushl.sat.v2i32(<2 x i32>, <2 x i32>)
 define i16 @sshlsat_i16(i16 %lhs, i16 %rhs) {
   ; CHECK-LABEL: name: sshlsat_i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SSHLSAT:%[0-9]+]]:_(s16) = G_SSHLSAT [[TRUNC]], [[TRUNC1]](s16)
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SSHLSAT]](s16)
   ; CHECK-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i16 @llvm.sshl.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %res
 }
@@ -397,13 +439,15 @@ declare i16 @llvm.sshl.sat.i16(i16, i16)
 define i32 @sshlsat_i32(i32 %lhs, i32 %rhs) {
   ; CHECK-LABEL: name: sshlsat_i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SSHLSAT:%[0-9]+]]:_(s32) = G_SSHLSAT [[COPY]], [[COPY1]](s32)
   ; CHECK-NEXT:   $vgpr0 = COPY [[SSHLSAT]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %res = call i32 @llvm.sshl.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %res
 }
@@ -412,7 +456,7 @@ declare i32 @llvm.sshl.sat.i32(i32, i32)
 define i64 @sshlsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-LABEL: name: sshlsat_i64
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -420,11 +464,13 @@ define i64 @sshlsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SSHLSAT:%[0-9]+]]:_(s64) = G_SSHLSAT [[MV]], [[MV1]](s64)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSHLSAT]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call i64 @llvm.sshl.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %res
 }
@@ -433,7 +479,7 @@ declare i64 @llvm.sshl.sat.i64(i64, i64)
 define <2 x i32> @sshlsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-LABEL: name: sshlsat_v2i32
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -441,11 +487,13 @@ define <2 x i32> @sshlsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[SSHLSAT:%[0-9]+]]:_(<2 x s32>) = G_SSHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s32>)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSHLSAT]](<2 x s32>)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
   %res = call <2 x i32> @llvm.sshl.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %res
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
index 34925828c2d13..d6da4fca21980 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
@@ -5,13 +5,15 @@
 define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
   ; GCN-LABEL: name: i32_fastcc_i32_i32
   ; GCN: bb.1 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
   ; GCN-NEXT:   $vgpr0 = COPY [[ADD]](s32)
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GCN-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %add0 = add i32 %arg0, %arg1
   ret i32 %add0
 }
@@ -19,10 +21,11 @@ define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
 define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
   ; GCN-LABEL: name: i32_fastcc_i32_i32_stack_object
   ; GCN: bb.1 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
   ; GCN-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
@@ -30,7 +33,8 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
   ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (volatile store (s32) into %ir.gep, addrspace 5)
   ; GCN-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
   ; GCN-NEXT:   $vgpr0 = COPY [[ADD]](s32)
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GCN-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
   store volatile i32 9, i32 addrspace(5)* %gep
@@ -41,16 +45,17 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
 define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
   ; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32
   ; GCN-NEXT:   $vgpr0 = COPY [[COPY]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[COPY1]](s32)
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
@@ -60,11 +65,12 @@ entry:
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
   ; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_stack_object
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
   ; GCN-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
@@ -73,8 +79,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b,
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32
   ; GCN-NEXT:   $vgpr0 = COPY [[COPY]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[COPY1]](s32)
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
@@ -87,11 +93,12 @@ entry:
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
   ; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_callee_stack_object
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
   ; GCN-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
@@ -100,8 +107,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32_stack_object
   ; GCN-NEXT:   $vgpr0 = COPY [[COPY]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[COPY1]](s32)
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32_stack_object, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
@@ -114,16 +121,17 @@ entry:
 define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
   ; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_unused_result
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32
   ; GCN-NEXT:   $vgpr0 = COPY [[COPY]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[COPY1]](s32)
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
@@ -163,15 +171,17 @@ entry:
 define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval(i32) align 4 %arg1) #1 {
   ; GCN-LABEL: name: i32_fastcc_i32_byval_i32
   ; GCN: bb.1 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr0
+  ; GCN-NEXT:   liveins: $vgpr0, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX]](p5)
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p5) :: (dereferenceable load (s32) from %ir.arg1, addrspace 5)
   ; GCN-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[LOAD]]
   ; GCN-NEXT:   $vgpr0 = COPY [[ADD]](s32)
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GCN-NEXT:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4
   %add0 = add i32 %arg0, %arg1.load
   ret i32 %add0
@@ -181,27 +191,29 @@ define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)*
 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32 addrspace(5)* byval(i32) %b.byval, i32 %c) #1 {
   ; GCN-LABEL: name: sibling_call_i32_fastcc_i32_byval_i32_byval_parent
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX]](p5)
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_byval_i32
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(p5) = COPY $sgpr32
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; GCN-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY3]], [[C]](s32)
+  ; GCN-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY4]], [[C]](s32)
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
   ; GCN-NEXT:   G_MEMCPY [[PTR_ADD]](p5), [[COPY1]](p5), [[C1]](s32), 0 :: (dereferenceable store (s32) into stack, addrspace 5), (dereferenceable load (s32) from %ir.b.byval, addrspace 5)
   ; GCN-NEXT:   $vgpr0 = COPY [[COPY]](s32)
-  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY5]](<4 x s32>)
   ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @i32_fastcc_i32_byval_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 4, implicit-def $scc
-  ; GCN-NEXT:   $vgpr0 = COPY [[COPY5]](s32)
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY6]](s32)
+  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; GCN-NEXT:   S_SETPC_B64_return [[COPY7]], implicit $vgpr0
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) %b.byval)
   ret i32 %ret
@@ -213,7 +225,7 @@ entry:
 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
   ; GCN-LABEL: name: sibling_call_i32_fastcc_i32_byval_i32
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -250,6 +262,7 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %lar
   ; GCN-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.2, align 16, addrspace 5)
   ; GCN-NEXT:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
   ; GCN-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.1, addrspace 5)
+  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
   ; GCN-NEXT:   [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[C]](s32)
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_byval_i32
@@ -257,8 +270,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %lar
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
   ; GCN-NEXT:   G_MEMCPY [[FRAME_INDEX2]](p5), [[INTTOPTR]](p5), [[C1]](s32), 0 :: (dereferenceable store (s32) into %fixed-stack.0, align 16, addrspace 5), (dereferenceable load (s32) from `i32 addrspace(5)* inttoptr (i32 16 to i32 addrspace(5)*)`, align 16, addrspace 5)
   ; GCN-NEXT:   $vgpr0 = COPY [[COPY]](s32)
-  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY31]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY32:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY32]](<4 x s32>)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_byval_i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) inttoptr (i32 16 to i32 addrspace(5)*))
@@ -268,7 +281,7 @@ entry:
 define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
   ; GCN-LABEL: name: i32_fastcc_i32_i32_a32i32
   ; GCN: bb.1 (%ir-block.0):
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -307,11 +320,13 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l
   ; GCN-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.1, addrspace 5)
   ; GCN-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; GCN-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.0, align 8, addrspace 5)
+  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
   ; GCN-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[LOAD1]]
   ; GCN-NEXT:   [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[LOAD2]]
   ; GCN-NEXT:   $vgpr0 = COPY [[ADD2]](s32)
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
+  ; GCN-NEXT:   S_SETPC_B64_return [[COPY32]], implicit $vgpr0
   %val_firststack = extractvalue [32 x i32] %large, 30
   %val_laststack = extractvalue [32 x i32] %large, 31
   %add0 = add i32 %arg0, %arg1
@@ -323,7 +338,7 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
   ; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_a32i32
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -362,6 +377,7 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x
   ; GCN-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.4, addrspace 5)
   ; GCN-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; GCN-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.3, align 8, addrspace 5)
+  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32_a32i32
   ; GCN-NEXT:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
   ; GCN-NEXT:   G_STORE [[LOAD]](s32), [[FRAME_INDEX3]](p5) :: (store (s32) into %fixed-stack.2, align 16, addrspace 5)
@@ -400,8 +416,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x
   ; GCN-NEXT:   $vgpr28 = COPY [[COPY28]](s32)
   ; GCN-NEXT:   $vgpr29 = COPY [[COPY29]](s32)
   ; GCN-NEXT:   $vgpr30 = COPY [[COPY30]](s32)
-  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY31]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY32:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY32]](<4 x s32>)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32_a32i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
@@ -411,7 +427,7 @@ entry:
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
   ; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -450,6 +466,7 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i
   ; GCN-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.4, addrspace 5)
   ; GCN-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; GCN-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.3, align 8, addrspace 5)
+  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
   ; GCN-NEXT:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
@@ -493,8 +510,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i
   ; GCN-NEXT:   $vgpr28 = COPY [[COPY28]](s32)
   ; GCN-NEXT:   $vgpr29 = COPY [[COPY29]](s32)
   ; GCN-NEXT:   $vgpr30 = COPY [[COPY30]](s32)
-  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY31]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY32:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY32]](<4 x s32>)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32_a32i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
@@ -510,22 +527,23 @@ entry:
 define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
   ; GCN-LABEL: name: no_sibling_call_callee_more_stack_space
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32_a32i32
-  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(p5) = COPY $sgpr32
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(p5) = COPY $sgpr32
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; GCN-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY2]], [[C1]](s32)
+  ; GCN-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY3]], [[C1]](s32)
   ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (store (s32) into stack, align 16, addrspace 5)
   ; GCN-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-  ; GCN-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY2]], [[C2]](s32)
+  ; GCN-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY3]], [[C2]](s32)
   ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack + 4, addrspace 5)
   ; GCN-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-  ; GCN-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY2]], [[C3]](s32)
+  ; GCN-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY3]], [[C3]](s32)
   ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD2]](p5) :: (store (s32) into stack + 8, align 8, addrspace 5)
   ; GCN-NEXT:   $vgpr0 = COPY [[COPY]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[COPY1]](s32)
@@ -558,13 +576,14 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
   ; GCN-NEXT:   $vgpr28 = COPY [[C]](s32)
   ; GCN-NEXT:   $vgpr29 = COPY [[C]](s32)
   ; GCN-NEXT:   $vgpr30 = COPY [[C]](s32)
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
   ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @i32_fastcc_i32_i32_a32i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 12, implicit-def $scc
-  ; GCN-NEXT:   $vgpr0 = COPY [[COPY4]](s32)
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   $vgpr0 = COPY [[COPY5]](s32)
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GCN-NEXT:   S_SETPC_B64_return [[COPY6]], implicit $vgpr0
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
   ret i32 %ret
@@ -574,26 +593,27 @@ entry:
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
   ; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_other_call
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32
   ; GCN-NEXT:   $vgpr0 = COPY [[COPY]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[COPY1]](s32)
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
   ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @i32_fastcc_i32_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
-  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   [[GV1:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @sibling_call_i32_fastcc_i32_i32
   ; GCN-NEXT:   $vgpr0 = COPY [[COPY]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[COPY1]](s32)
-  ; GCN-NEXT:   $vgpr2 = COPY [[COPY4]](s32)
-  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY5]](<4 x s32>)
+  ; GCN-NEXT:   $vgpr2 = COPY [[COPY5]](s32)
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY6]](<4 x s32>)
   ; GCN-NEXT:   SI_TCRETURN [[GV1]](p0), @sibling_call_i32_fastcc_i32_i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 entry:
   %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
@@ -606,7 +626,7 @@ entry:
 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
   ; GCN-LABEL: name: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -645,6 +665,7 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i3
   ; GCN-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.4, addrspace 5)
   ; GCN-NEXT:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; GCN-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.3, align 8, addrspace 5)
+  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
   ; GCN-NEXT:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
@@ -688,8 +709,8 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i3
   ; GCN-NEXT:   $vgpr28 = COPY [[COPY28]](s32)
   ; GCN-NEXT:   $vgpr29 = COPY [[COPY29]](s32)
   ; GCN-NEXT:   $vgpr30 = COPY [[COPY30]](s32)
-  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY31]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY32:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY32]](<4 x s32>)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32_a32i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
@@ -702,7 +723,7 @@ entry:
 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
   ; GCN-LABEL: name: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -749,6 +770,7 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg
   ; GCN-NEXT:   [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load (s32) from %fixed-stack.4, addrspace 5)
   ; GCN-NEXT:   [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; GCN-NEXT:   [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s32) from %fixed-stack.3, align 8, addrspace 5)
+  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GCN-NEXT:   [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
@@ -793,8 +815,8 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg
   ; GCN-NEXT:   $vgpr28 = COPY [[C1]](s32)
   ; GCN-NEXT:   $vgpr29 = COPY [[C1]](s32)
   ; GCN-NEXT:   $vgpr30 = COPY [[C1]](s32)
-  ; GCN-NEXT:   [[COPY31:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY31]](<4 x s32>)
+  ; GCN-NEXT:   [[COPY32:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY32]](<4 x s32>)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32_a32i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
@@ -809,7 +831,7 @@ declare hidden void @void_fastcc_multi_byval(i32 %a, [3 x i32] addrspace(5)* byv
 define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
   ; GCN-LABEL: name: sibling_call_fastcc_multi_byval
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -918,6 +940,7 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
   ; GCN-NEXT:   [[LOAD32:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX32]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5)
   ; GCN-NEXT:   [[FRAME_INDEX33:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
   ; GCN-NEXT:   [[LOAD33:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX33]](p5) :: (invariant load (s32) from %fixed-stack.2, addrspace 5)
+  ; GCN-NEXT:   [[COPY39:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
   ; GCN-NEXT:   [[FRAME_INDEX34:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca0
@@ -933,14 +956,14 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
   ; GCN-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX35]], [[C3]](s32)
   ; GCN-NEXT:   G_STORE [[C1]](s64), [[PTR_ADD2]](p5) :: (store (s64) into %ir.alloca1 + 8, addrspace 5)
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @void_fastcc_multi_byval
-  ; GCN-NEXT:   [[COPY39:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GCN-NEXT:   [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GCN-NEXT:   [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GCN-NEXT:   [[COPY42:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GCN-NEXT:   [[COPY43:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GCN-NEXT:   [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GCN-NEXT:   [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GCN-NEXT:   [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN-NEXT:   [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN-NEXT:   [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN-NEXT:   [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN-NEXT:   [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN-NEXT:   [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN-NEXT:   [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN-NEXT:   [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN-NEXT:   [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GCN-NEXT:   [[FRAME_INDEX36:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
   ; GCN-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; GCN-NEXT:   G_MEMCPY [[FRAME_INDEX36]](p5), [[FRAME_INDEX34]](p5), [[C4]](s32), 0 :: (dereferenceable store (s96) into %fixed-stack.1, align 16, addrspace 5), (dereferenceable load (s96) from %ir.alloca0, align 16, addrspace 5)
@@ -948,16 +971,16 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
   ; GCN-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
   ; GCN-NEXT:   G_MEMCPY [[FRAME_INDEX37]](p5), [[FRAME_INDEX35]](p5), [[C5]](s32), 0 :: (dereferenceable store (s128) into %fixed-stack.0, addrspace 5), (dereferenceable load (s128) from %ir.alloca1, align 8, addrspace 5)
   ; GCN-NEXT:   $vgpr0 = COPY [[COPY8]](s32)
-  ; GCN-NEXT:   [[COPY47:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY47]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY39]](p4)
-  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY40]](p4)
-  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY41]](p4)
-  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY42]](s64)
-  ; GCN-NEXT:   $sgpr12 = COPY [[COPY43]](s32)
-  ; GCN-NEXT:   $sgpr13 = COPY [[COPY44]](s32)
-  ; GCN-NEXT:   $sgpr14 = COPY [[COPY45]](s32)
-  ; GCN-NEXT:   $vgpr31 = COPY [[COPY46]](s32)
+  ; GCN-NEXT:   [[COPY48:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY48]](<4 x s32>)
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY40]](p4)
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY41]](p4)
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY42]](p4)
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY43]](s64)
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY44]](s32)
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY45]](s32)
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY46]](s32)
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY47]](s32)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @void_fastcc_multi_byval, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
 entry:
   %alloca0 = alloca [3 x i32], align 16, addrspace(5)
@@ -974,7 +997,7 @@ declare hidden void @void_fastcc_byval_and_stack_passed([3 x i32] addrspace(5)*
 define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 {
   ; GCN-LABEL: name: sibling_call_byval_and_stack_passed
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -1083,6 +1106,7 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64
   ; GCN-NEXT:   [[LOAD32:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX32]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5)
   ; GCN-NEXT:   [[FRAME_INDEX33:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; GCN-NEXT:   [[LOAD33:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX33]](p5) :: (invariant load (s32) from %fixed-stack.3, addrspace 5)
+  ; GCN-NEXT:   [[COPY39:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
   ; GCN-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GCN-NEXT:   [[FRAME_INDEX34:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
@@ -1094,14 +1118,14 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64
   ; GCN-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C3]](s32)
   ; GCN-NEXT:   G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into %ir.alloca + 8, addrspace 5)
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @void_fastcc_byval_and_stack_passed
-  ; GCN-NEXT:   [[COPY39:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GCN-NEXT:   [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GCN-NEXT:   [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GCN-NEXT:   [[COPY42:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GCN-NEXT:   [[COPY43:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GCN-NEXT:   [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GCN-NEXT:   [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GCN-NEXT:   [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN-NEXT:   [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN-NEXT:   [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN-NEXT:   [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN-NEXT:   [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN-NEXT:   [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN-NEXT:   [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN-NEXT:   [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN-NEXT:   [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GCN-NEXT:   [[FRAME_INDEX35:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
   ; GCN-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; GCN-NEXT:   G_MEMCPY [[FRAME_INDEX35]](p5), [[FRAME_INDEX34]](p5), [[C4]](s32), 0 :: (dereferenceable store (s96) into %fixed-stack.2, align 16, addrspace 5), (dereferenceable load (s96) from %ir.alloca, align 16, addrspace 5)
@@ -1140,16 +1164,16 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64
   ; GCN-NEXT:   $vgpr28 = COPY [[C1]](s32)
   ; GCN-NEXT:   $vgpr29 = COPY [[C1]](s32)
   ; GCN-NEXT:   $vgpr30 = COPY [[C1]](s32)
-  ; GCN-NEXT:   [[COPY47:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY47]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY39]](p4)
-  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY40]](p4)
-  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY41]](p4)
-  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY42]](s64)
-  ; GCN-NEXT:   $sgpr12 = COPY [[COPY43]](s32)
-  ; GCN-NEXT:   $sgpr13 = COPY [[COPY44]](s32)
-  ; GCN-NEXT:   $sgpr14 = COPY [[COPY45]](s32)
-  ; GCN-NEXT:   $vgpr31 = COPY [[COPY46]](s32)
+  ; GCN-NEXT:   [[COPY48:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY48]](<4 x s32>)
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY40]](p4)
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY41]](p4)
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY42]](p4)
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY43]](s64)
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY44]](s32)
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY45]](s32)
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY46]](s32)
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY47]](s32)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @void_fastcc_byval_and_stack_passed, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
 entry:
   %alloca = alloca [3 x i32], align 16, addrspace(5)
@@ -1163,7 +1187,7 @@ declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0)
 define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
   ; GCN-LABEL: name: sibling_call_i64_fastcc_i64
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -1176,28 +1200,29 @@ define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
   ; GCN-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i64_fastcc_i64
-  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
   ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
-  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
-  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
-  ; GCN-NEXT:   $sgpr12 = COPY [[COPY14]](s32)
-  ; GCN-NEXT:   $sgpr13 = COPY [[COPY15]](s32)
-  ; GCN-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
-  ; GCN-NEXT:   $vgpr31 = COPY [[COPY17]](s32)
+  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY12]](p4)
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY13]](p4)
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY14]](s64)
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY15]](s32)
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY16]](s32)
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i64_fastcc_i64, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
 entry:
   %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a)
@@ -1209,7 +1234,7 @@ declare hidden fastcc i8 addrspace(1)* @p1i8_fastcc_p1i8(i8 addrspace(1)* %arg0)
 define hidden fastcc i8 addrspace(1)* @sibling_call_p1i8_fastcc_p1i8(i8 addrspace(1)* %a) #1 {
   ; GCN-LABEL: name: sibling_call_p1i8_fastcc_p1i8
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -1222,28 +1247,29 @@ define hidden fastcc i8 addrspace(1)* @sibling_call_p1i8_fastcc_p1i8(i8 addrspac
   ; GCN-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @p1i8_fastcc_p1i8
-  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1)
   ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
-  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
-  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
-  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
-  ; GCN-NEXT:   $sgpr12 = COPY [[COPY14]](s32)
-  ; GCN-NEXT:   $sgpr13 = COPY [[COPY15]](s32)
-  ; GCN-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
-  ; GCN-NEXT:   $vgpr31 = COPY [[COPY17]](s32)
+  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY12]](p4)
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY13]](p4)
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY14]](s64)
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY15]](s32)
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY16]](s32)
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @p1i8_fastcc_p1i8, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
 entry:
   %ret = tail call fastcc i8 addrspace(1)* @p1i8_fastcc_p1i8(i8 addrspace(1)* %a)
@@ -1255,7 +1281,7 @@ declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0)
 define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
   ; GCN-LABEL: name: sibling_call_i16_fastcc_i16
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -1267,27 +1293,28 @@ define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
   ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
   ; GCN-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY8]](s32)
+  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i16_fastcc_i16
-  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GCN-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
   ; GCN-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
-  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
-  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
-  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
-  ; GCN-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
-  ; GCN-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
-  ; GCN-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
-  ; GCN-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
+  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY14]](s32)
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY15]](s32)
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY17]](s32)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @i16_fastcc_i16, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
 entry:
   %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a)
@@ -1299,7 +1326,7 @@ declare hidden fastcc half @f16_fastcc_f16(half %arg0)
 define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
   ; GCN-LABEL: name: sibling_call_f16_fastcc_f16
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -1311,27 +1338,28 @@ define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
   ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
   ; GCN-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY8]](s32)
+  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @f16_fastcc_f16
-  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GCN-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
   ; GCN-NEXT:   $vgpr0 = COPY [[ANYEXT]](s32)
-  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
-  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
-  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
-  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
-  ; GCN-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
-  ; GCN-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
-  ; GCN-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
-  ; GCN-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
+  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY14]](s32)
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY15]](s32)
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY17]](s32)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @f16_fastcc_f16, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
 entry:
   %ret = tail call fastcc half @f16_fastcc_f16(half %a)
@@ -1343,7 +1371,7 @@ declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0)
 define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 {
   ; GCN-LABEL: name: sibling_call_v3i16_fastcc_v3i16
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -1358,30 +1386,31 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1
   ; GCN-NEXT:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; GCN-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[DEF]](<2 x s16>)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @v3i16_fastcc_v3i16
-  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GCN-NEXT:   [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
   ; GCN-NEXT:   [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<3 x s16>), [[DEF1]](<3 x s16>)
   ; GCN-NEXT:   [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
   ; GCN-NEXT:   $vgpr0 = COPY [[UV2]](<2 x s16>)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV3]](<2 x s16>)
-  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
-  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
-  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
-  ; GCN-NEXT:   $sgpr12 = COPY [[COPY14]](s32)
-  ; GCN-NEXT:   $sgpr13 = COPY [[COPY15]](s32)
-  ; GCN-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
-  ; GCN-NEXT:   $vgpr31 = COPY [[COPY17]](s32)
+  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY12]](p4)
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY13]](p4)
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY14]](s64)
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY15]](s32)
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY16]](s32)
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @v3i16_fastcc_v3i16, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
 entry:
   %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a)
@@ -1393,7 +1422,7 @@ declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0)
 define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 {
   ; GCN-LABEL: name: sibling_call_v4i16_fastcc_v4i16
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -1406,28 +1435,29 @@ define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1
   ; GCN-NEXT:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; GCN-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>)
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @v4i16_fastcc_v4i16
-  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>)
   ; GCN-NEXT:   $vgpr0 = COPY [[UV]](<2 x s16>)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](<2 x s16>)
-  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
-  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
-  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
-  ; GCN-NEXT:   $sgpr12 = COPY [[COPY14]](s32)
-  ; GCN-NEXT:   $sgpr13 = COPY [[COPY15]](s32)
-  ; GCN-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
-  ; GCN-NEXT:   $vgpr31 = COPY [[COPY17]](s32)
+  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY12]](p4)
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY13]](p4)
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY14]](s64)
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY15]](s32)
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY16]](s32)
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @v4i16_fastcc_v4i16, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
 entry:
   %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a)
@@ -1439,7 +1469,7 @@ declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0)
 define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 {
   ; GCN-LABEL: name: sibling_call_v2i64_fastcc_v2i64
   ; GCN: bb.1.entry:
-  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; GCN-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -1456,30 +1486,31 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1
   ; GCN-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
   ; GCN-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
   ; GCN-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
+  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @v2i64_fastcc_v2i64
-  ; GCN-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN-NEXT:   [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN-NEXT:   [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN-NEXT:   [[COPY15:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN-NEXT:   [[COPY16:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN-NEXT:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; GCN-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>)
   ; GCN-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GCN-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; GCN-NEXT:   $vgpr2 = COPY [[UV2]](s32)
   ; GCN-NEXT:   $vgpr3 = COPY [[UV3]](s32)
-  ; GCN-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY12]](p4)
-  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY13]](p4)
-  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY14]](p4)
-  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY15]](s64)
-  ; GCN-NEXT:   $sgpr12 = COPY [[COPY16]](s32)
-  ; GCN-NEXT:   $sgpr13 = COPY [[COPY17]](s32)
-  ; GCN-NEXT:   $sgpr14 = COPY [[COPY18]](s32)
-  ; GCN-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
+  ; GCN-NEXT:   [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
+  ; GCN-NEXT:   $sgpr4_sgpr5 = COPY [[COPY13]](p4)
+  ; GCN-NEXT:   $sgpr6_sgpr7 = COPY [[COPY14]](p4)
+  ; GCN-NEXT:   $sgpr8_sgpr9 = COPY [[COPY15]](p4)
+  ; GCN-NEXT:   $sgpr10_sgpr11 = COPY [[COPY16]](s64)
+  ; GCN-NEXT:   $sgpr12 = COPY [[COPY17]](s32)
+  ; GCN-NEXT:   $sgpr13 = COPY [[COPY18]](s32)
+  ; GCN-NEXT:   $sgpr14 = COPY [[COPY19]](s32)
+  ; GCN-NEXT:   $vgpr31 = COPY [[COPY20]](s32)
   ; GCN-NEXT:   SI_TCRETURN [[GV]](p0), @v2i64_fastcc_v2i64, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
 entry:
   %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll
index 87b53c69f6120..1147647ec514a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll
@@ -6,7 +6,7 @@ declare hidden void @external_void_func_void()
 define void @tail_call_void_func_void() {
   ; CHECK-LABEL: name: tail_call_void_func_void
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
@@ -16,25 +16,26 @@ define void @tail_call_void_func_void() {
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_void
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY8]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY9]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY11]](s64)
-  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY12]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY13]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY14]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY13]](s32)
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY14]](s32)
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY15]](s32)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   SI_TCRETURN [[GV]](p0), @external_void_func_void, 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   tail call void @external_void_func_void()
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
index 815f9b1a07fe9..87ef4479d4bf5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
@@ -187,21 +187,25 @@ define float @ds_fmax_f32_vv(float addrspace(3)* %ptr, float %val) {
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv
   ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR:   liveins: $vgpr0, $vgpr1
+  ; GFX8-MIR:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GFX8-MIR:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8-MIR:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX8-MIR:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX8-MIR:   $m0 = S_MOV_B32 -1
   ; GFX8-MIR:   [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
   ; GFX8-MIR:   $vgpr0 = COPY [[DS_MAX_RTN_F32_]]
-  ; GFX8-MIR:   SI_RETURN implicit $vgpr0
+  ; GFX8-MIR:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GFX8-MIR:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv
   ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR:   liveins: $vgpr0, $vgpr1
+  ; GFX9-MIR:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GFX9-MIR:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX9-MIR:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-MIR:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX9-MIR:   [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
   ; GFX9-MIR:   $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
-  ; GFX9-MIR:   SI_RETURN implicit $vgpr0
+  ; GFX9-MIR:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GFX9-MIR:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %ret = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false)
   ret float %ret
 }
@@ -223,21 +227,25 @@ define float @ds_fmax_f32_vv_offset(float addrspace(3)* %ptr, float %val) {
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_offset
   ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR:   liveins: $vgpr0, $vgpr1
+  ; GFX8-MIR:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GFX8-MIR:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8-MIR:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX8-MIR:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX8-MIR:   $m0 = S_MOV_B32 -1
   ; GFX8-MIR:   [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 512, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
   ; GFX8-MIR:   $vgpr0 = COPY [[DS_MAX_RTN_F32_]]
-  ; GFX8-MIR:   SI_RETURN implicit $vgpr0
+  ; GFX8-MIR:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GFX8-MIR:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_offset
   ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR:   liveins: $vgpr0, $vgpr1
+  ; GFX9-MIR:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GFX9-MIR:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX9-MIR:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-MIR:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX9-MIR:   [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 512, 0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
   ; GFX9-MIR:   $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
-  ; GFX9-MIR:   SI_RETURN implicit $vgpr0
+  ; GFX9-MIR:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GFX9-MIR:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %gep = getelementptr float, float addrspace(3)* %ptr, i32 128
   %ret = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false)
   ret float %ret
@@ -260,19 +268,23 @@ define void @ds_fmax_f32_vv_nortn(float addrspace(3)* %ptr, float %val) {
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_nortn
   ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR:   liveins: $vgpr0, $vgpr1
+  ; GFX8-MIR:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GFX8-MIR:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8-MIR:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX8-MIR:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX8-MIR:   $m0 = S_MOV_B32 -1
   ; GFX8-MIR:   [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX8-MIR:   SI_RETURN
+  ; GFX8-MIR:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GFX8-MIR:   S_SETPC_B64_return [[COPY3]]
   ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_nortn
   ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR:   liveins: $vgpr0, $vgpr1
+  ; GFX9-MIR:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GFX9-MIR:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX9-MIR:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-MIR:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX9-MIR:   [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3)
-  ; GFX9-MIR:   SI_RETURN
+  ; GFX9-MIR:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GFX9-MIR:   S_SETPC_B64_return [[COPY3]]
   %ret = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false)
   ret void
 }
@@ -294,19 +306,23 @@ define void @ds_fmax_f32_vv_offset_nortn(float addrspace(3)* %ptr, float %val) {
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_offset_nortn
   ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR:   liveins: $vgpr0, $vgpr1
+  ; GFX8-MIR:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GFX8-MIR:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8-MIR:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX8-MIR:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX8-MIR:   $m0 = S_MOV_B32 -1
   ; GFX8-MIR:   [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 512, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
-  ; GFX8-MIR:   SI_RETURN
+  ; GFX8-MIR:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GFX8-MIR:   S_SETPC_B64_return [[COPY3]]
   ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_offset_nortn
   ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR:   liveins: $vgpr0, $vgpr1
+  ; GFX9-MIR:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GFX9-MIR:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX9-MIR:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-MIR:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX9-MIR:   [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 512, 0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3)
-  ; GFX9-MIR:   SI_RETURN
+  ; GFX9-MIR:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GFX9-MIR:   S_SETPC_B64_return [[COPY3]]
   %gep = getelementptr float, float addrspace(3)* %ptr, i32 128
   %ret = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false)
   ret void
@@ -329,21 +345,25 @@ define float @ds_fmax_f32_vv_volatile(float addrspace(3)* %ptr, float %val) {
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_volatile
   ; GFX8-MIR: bb.1 (%ir-block.0):
-  ; GFX8-MIR:   liveins: $vgpr0, $vgpr1
+  ; GFX8-MIR:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GFX8-MIR:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8-MIR:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX8-MIR:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX8-MIR:   $m0 = S_MOV_B32 -1
   ; GFX8-MIR:   [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (volatile load store (s32) on %ir.ptr, addrspace 3)
   ; GFX8-MIR:   $vgpr0 = COPY [[DS_MAX_RTN_F32_]]
-  ; GFX8-MIR:   SI_RETURN implicit $vgpr0
+  ; GFX8-MIR:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GFX8-MIR:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_volatile
   ; GFX9-MIR: bb.1 (%ir-block.0):
-  ; GFX9-MIR:   liveins: $vgpr0, $vgpr1
+  ; GFX9-MIR:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GFX9-MIR:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX9-MIR:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-MIR:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GFX9-MIR:   [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (volatile load store (s32) on %ir.ptr, addrspace 3)
   ; GFX9-MIR:   $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
-  ; GFX9-MIR:   SI_RETURN implicit $vgpr0
+  ; GFX9-MIR:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; GFX9-MIR:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
   %ret = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 true)
   ret float %ret
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index b4a7d4f9855aa..7d604236777dd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -269,22 +269,22 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x)
 ; GFX10-NEXT:    v_lshrrev_b16 v6, 8, v1
 ; GFX10-NEXT:    ds_write_b8 v0, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX10-NEXT:    v_lshrrev_b16 v7, 8, v2
-; GFX10-NEXT:    v_lshrrev_b16 v8, 8, v5
+; GFX10-NEXT:    v_lshrrev_b16 v8, 8, v2
+; GFX10-NEXT:    v_lshrrev_b16 v7, 8, v5
 ; GFX10-NEXT:    ds_write_b8 v0, v2 offset:4
 ; GFX10-NEXT:    ds_write_b8 v0, v6 offset:1
 ; GFX10-NEXT:    ds_write_b8 v0, v5 offset:2
-; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v3
+; GFX10-NEXT:    ds_write_b8 v0, v7 offset:3
 ; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v1
-; GFX10-NEXT:    ds_write_b8 v0, v8 offset:3
-; GFX10-NEXT:    ds_write_b8 v0, v7 offset:5
+; GFX10-NEXT:    ds_write_b8 v0, v8 offset:5
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:6
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v3
 ; GFX10-NEXT:    ds_write_b8 v0, v2 offset:7
 ; GFX10-NEXT:    ds_write_b8 v0, v3 offset:8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX10-NEXT:    ds_write_b8 v0, v5 offset:9
 ; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v1
+; GFX10-NEXT:    ds_write_b8 v0, v5 offset:9
 ; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v4
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:10
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v2
@@ -344,21 +344,21 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
 ; GFX10-NEXT:    ds_write_b8 v0, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v6, 8, v2
+; GFX10-NEXT:    v_lshrrev_b16 v7, 8, v4
 ; GFX10-NEXT:    ds_write_b8 v0, v2 offset:4
-; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    ds_write_b8 v0, v5 offset:1
 ; GFX10-NEXT:    ds_write_b8 v0, v4 offset:2
+; GFX10-NEXT:    ds_write_b8 v0, v7 offset:3
 ; GFX10-NEXT:    v_lshrrev_b16 v4, 8, v1
-; GFX10-NEXT:    ds_write_b8 v0, v2 offset:3
-; GFX10-NEXT:    ds_write_b8 v0, v6 offset:5
-; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v3
+; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v3
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:6
-; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v7
+; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v2
+; GFX10-NEXT:    ds_write_b8 v0, v6 offset:5
 ; GFX10-NEXT:    ds_write_b8 v0, v4 offset:7
 ; GFX10-NEXT:    ds_write_b8 v0, v3 offset:8
-; GFX10-NEXT:    ds_write_b8 v0, v2 offset:9
-; GFX10-NEXT:    ds_write_b8 v0, v7 offset:10
+; GFX10-NEXT:    ds_write_b8 v0, v5 offset:9
+; GFX10-NEXT:    ds_write_b8 v0, v2 offset:10
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir
index ab05e6ef7ead5..da576626e9954 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir
@@ -9,20 +9,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_min_max_ValK0_K1_i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY3]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 -12
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_SMAX %0, %7
@@ -30,7 +33,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_SMIN %3, %8
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -40,20 +44,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: min_max_ValK0_K1_i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY3]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 -12
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_SMAX %7, %0
@@ -61,7 +68,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_SMIN %3, %8
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -71,20 +79,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_min_K1max_ValK0__i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY3]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 -12
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_SMAX %0, %7
@@ -92,7 +103,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_SMIN %8, %3
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -102,20 +114,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_min_K1max_K0Val__i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY3]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 -12
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_SMAX %7, %0
@@ -123,7 +138,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_SMIN %8, %3
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -133,20 +149,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_max_min_ValK1_K0_i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY3]], [[COPY2]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 17
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_SMIN %0, %7
@@ -154,7 +173,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_SMAX %3, %8
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -164,20 +184,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_max_min_K1Val_K0_i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY3]], [[COPY2]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 17
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_SMIN %7, %0
@@ -185,7 +208,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_SMAX %3, %8
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -195,20 +219,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_max_K0min_ValK1__i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY3]], [[COPY2]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 17
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_SMIN %0, %7
@@ -216,7 +243,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_SMAX %8, %3
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -226,20 +254,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_max_K0min_K1Val__i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY3]], [[COPY2]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 17
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_SMIN %7, %0
@@ -247,7 +278,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_SMAX %8, %3
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -257,12 +289,13 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_max_K0min_K1Val__v2i16
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
@@ -272,8 +305,10 @@ body: |
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
     ; CHECK-NEXT: [[SMAX:%[0-9]+]]:vgpr(<2 x s16>) = G_SMAX [[COPY3]], [[SMIN]]
     ; CHECK-NEXT: $vgpr0 = COPY [[SMAX]](<2 x s16>)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(<2 x s16>) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %9:sgpr(s32) = G_CONSTANT i32 17
     %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %9(s32), %9(s32)
     %10:sgpr(s32) = G_CONSTANT i32 -12
@@ -283,7 +318,8 @@ body: |
     %12:vgpr(<2 x s16>) = COPY %5(<2 x s16>)
     %7:vgpr(<2 x s16>) = G_SMAX %12, %4
     $vgpr0 = COPY %7(<2 x s16>)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %8:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %8, implicit $vgpr0
 ...
 
 ---
@@ -325,20 +361,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_non_inline_constant_i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY3]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 -12
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_SMAX %0, %7
@@ -346,5 +385,6 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_SMIN %3, %8
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir
index 4ecc912fc7e44..8c5d796a60e06 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir
@@ -9,20 +9,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_min_max_ValK0_K1_u32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY3]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 12
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_UMAX %0, %7
@@ -30,7 +33,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_UMIN %3, %8
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -40,20 +44,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: min_max_ValK0_K1_i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY3]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 12
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_UMAX %7, %0
@@ -61,7 +68,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_UMIN %3, %8
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -71,20 +79,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_min_K1max_ValK0__u32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY3]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 12
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_UMAX %0, %7
@@ -92,7 +103,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_UMIN %8, %3
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -102,20 +114,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_min_K1max_K0Val__u32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY3]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 12
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_UMAX %7, %0
@@ -123,7 +138,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_UMIN %8, %3
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -133,20 +149,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_max_min_ValK1_K0_u32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY3]], [[COPY2]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 17
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_UMIN %0, %7
@@ -154,7 +173,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_UMAX %3, %8
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -164,20 +184,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_max_min_K1Val_K0_u32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY3]], [[COPY2]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 17
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_UMIN %7, %0
@@ -185,7 +208,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_UMAX %3, %8
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -195,20 +219,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_max_K0min_ValK1__u32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY3]], [[COPY2]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 17
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_UMIN %0, %7
@@ -216,7 +243,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_UMAX %8, %3
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -226,20 +254,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_max_K0min_K1Val__u32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY3]], [[COPY2]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 17
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_UMIN %7, %0
@@ -247,7 +278,8 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_UMAX %8, %3
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...
 
 ---
@@ -257,12 +289,13 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_max_K0min_K1Val__v2u16
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
     ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
@@ -272,8 +305,10 @@ body: |
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
     ; CHECK-NEXT: [[UMAX:%[0-9]+]]:vgpr(<2 x s16>) = G_UMAX [[COPY3]], [[UMIN]]
     ; CHECK-NEXT: $vgpr0 = COPY [[UMAX]](<2 x s16>)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(<2 x s16>) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %9:sgpr(s32) = G_CONSTANT i32 17
     %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %9(s32), %9(s32)
     %10:sgpr(s32) = G_CONSTANT i32 12
@@ -283,7 +318,8 @@ body: |
     %12:vgpr(<2 x s16>) = COPY %5(<2 x s16>)
     %7:vgpr(<2 x s16>) = G_UMAX %12, %4
     $vgpr0 = COPY %7(<2 x s16>)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %8:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %8, implicit $vgpr0
 ...
 
 ---
@@ -326,20 +362,23 @@ regBankSelected: true
 tracksRegLiveness: true
 body: |
   bb.1:
-    liveins: $vgpr0
+    liveins: $vgpr0, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_non_inline_constant_i32
-    ; CHECK: liveins: $vgpr0
+    ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
     ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY3]]
     ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
-    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
     %0:vgpr(s32) = COPY $vgpr0
+    %1:sgpr_64 = COPY $sgpr30_sgpr31
     %2:sgpr(s32) = G_CONSTANT i32 12
     %7:vgpr(s32) = COPY %2(s32)
     %3:vgpr(s32) = G_UMAX %0, %7
@@ -347,5 +386,6 @@ body: |
     %8:vgpr(s32) = COPY %4(s32)
     %5:vgpr(s32) = G_UMIN %3, %8
     $vgpr0 = COPY %5(s32)
-    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
+    %6:ccr_sgpr_64 = COPY %1
+    S_SETPC_B64_return %6, implicit $vgpr0
 ...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
index 6416026570058..d9385251a4dcb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -401,13 +401,13 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_rndne_f16_e32 v3, v0
+; GFX10-NEXT:    v_rndne_f16_e32 v2, v0
 ; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_rndne_f16_e32 v4, v1
+; GFX10-NEXT:    v_rndne_f16_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v0, v3, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v1, v4, v2, v1
+; GFX10-NEXT:    v_and_or_b32 v0, v2, v4, v0
+; GFX10-NEXT:    v_and_or_b32 v1, v3, v4, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
   ret <4 x half> %roundeven

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 1389e96f592c9..c118e757ef539 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -642,25 +642,25 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v7
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v3, v6, v2, v4
-; GFX10-NEXT:    v_and_or_b32 v2, v8, v2, v5
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v6
+; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
+; GFX10-NEXT:    v_and_or_b32 v3, v8, v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v4, 24
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_mov_b32_e32 v4, 24
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_pk_add_i16 v1, v3, v2 clamp
+; GFX10-NEXT:    v_pk_add_i16 v1, v2, v3 clamp
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 8fee6c91474c4..1f8a322dbdd12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -642,25 +642,25 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v7
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v3, v6, v2, v4
-; GFX10-NEXT:    v_and_or_b32 v2, v8, v2, v5
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v6
+; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
+; GFX10-NEXT:    v_and_or_b32 v3, v8, v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v4, 24
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_mov_b32_e32 v4, 24
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_pk_sub_i16 v1, v3, v2 clamp
+; GFX10-NEXT:    v_pk_sub_i16 v1, v2, v3 clamp
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 8e0d114346920..503b45d49991c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -471,25 +471,25 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v7
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v3, v6, v2, v4
-; GFX10-NEXT:    v_and_or_b32 v2, v8, v2, v5
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v6
+; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
+; GFX10-NEXT:    v_and_or_b32 v3, v8, v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v4, 24
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_mov_b32_e32 v4, 24
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_pk_add_u16 v1, v3, v2 clamp
+; GFX10-NEXT:    v_pk_add_u16 v1, v2, v3 clamp
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 82959c8f22cb1..a4da5822dac57 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -459,25 +459,25 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v7
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_and_or_b32 v3, v6, v2, v4
-; GFX10-NEXT:    v_and_or_b32 v2, v8, v2, v5
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v6
+; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
+; GFX10-NEXT:    v_and_or_b32 v3, v8, v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v4, 24
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_mov_b32_e32 v4, 24
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_pk_sub_u16 v1, v3, v2 clamp
+; GFX10-NEXT:    v_pk_sub_u16 v1, v2, v3 clamp
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]

diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index dcde1b181b0cf..11084620176dd 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -29,15 +29,15 @@ define void @parent_func_missing_inputs() #0 {
 ; FIXEDABI-NEXT:    s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4
 ; FIXEDABI-NEXT:    s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
 ; FIXEDABI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; FIXEDABI-NEXT:    v_readlane_b32 s31, v40, 1
-; FIXEDABI-NEXT:    v_readlane_b32 s30, v40, 0
+; FIXEDABI-NEXT:    v_readlane_b32 s4, v40, 0
+; FIXEDABI-NEXT:    v_readlane_b32 s5, v40, 1
 ; FIXEDABI-NEXT:    s_addk_i32 s32, 0xfc00
 ; FIXEDABI-NEXT:    v_readlane_b32 s33, v40, 2
-; FIXEDABI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; FIXEDABI-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; FIXEDABI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; FIXEDABI-NEXT:    s_mov_b64 exec, s[4:5]
+; FIXEDABI-NEXT:    s_mov_b64 exec, s[6:7]
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    s_setpc_b64 s[30:31]
+; FIXEDABI-NEXT:    s_setpc_b64 s[4:5]
   call void @requires_all_inputs()
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 65f5c623a4e7e..546bba7146b44 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -173,9 +173,9 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      no_stack_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x24{{$}}
-; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT:        .vgpr_count:     0x3{{$}}
+; GCN-NEXT:        .sgpr_count:     0x26{{$}}
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
+; GCN-NEXT:        .vgpr_count:     0x2{{$}}
 ; GCN-NEXT:      no_stack_extern_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
@@ -213,9 +213,9 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:        .vgpr_count:     0x2{{$}}
 ; GCN-NEXT:      simple_stack_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x24{{$}}
+; GCN-NEXT:        .sgpr_count:     0x26{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT:        .vgpr_count:     0x4{{$}}
+; GCN-NEXT:        .vgpr_count:     0x3{{$}}
 ; GCN-NEXT:      simple_stack_extern_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; GFX8-NEXT:        .sgpr_count:     0x28{{$}}

diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index 0f86d0ee03607..8ea4dbed7866f 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -903,80 +903,80 @@ define void @spill_func(i32 addrspace(1)* %arg) #0 {
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt expcnt(1)
-; CHECK-NEXT:    v_writelane_b32 v0, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v0, s33, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s34, 3
-; CHECK-NEXT:    v_writelane_b32 v0, s35, 4
-; CHECK-NEXT:    v_writelane_b32 v0, s36, 5
-; CHECK-NEXT:    v_writelane_b32 v0, s37, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s38, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s39, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s40, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s41, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s42, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s43, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s44, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s45, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s46, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s47, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s48, 17
-; CHECK-NEXT:    v_writelane_b32 v0, s49, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s50, 19
-; CHECK-NEXT:    v_writelane_b32 v0, s51, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s52, 21
-; CHECK-NEXT:    v_writelane_b32 v0, s53, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s54, 23
-; CHECK-NEXT:    v_writelane_b32 v0, s55, 24
-; CHECK-NEXT:    v_writelane_b32 v0, s56, 25
-; CHECK-NEXT:    v_writelane_b32 v0, s57, 26
-; CHECK-NEXT:    v_writelane_b32 v0, s58, 27
-; CHECK-NEXT:    v_writelane_b32 v0, s59, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s60, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s61, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s62, 31
-; CHECK-NEXT:    v_writelane_b32 v0, s63, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s64, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s65, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s66, 35
-; CHECK-NEXT:    v_writelane_b32 v0, s67, 36
-; CHECK-NEXT:    v_writelane_b32 v0, s68, 37
-; CHECK-NEXT:    v_writelane_b32 v0, s69, 38
-; CHECK-NEXT:    v_writelane_b32 v0, s70, 39
-; CHECK-NEXT:    v_writelane_b32 v0, s71, 40
-; CHECK-NEXT:    v_writelane_b32 v0, s72, 41
-; CHECK-NEXT:    v_writelane_b32 v0, s73, 42
-; CHECK-NEXT:    v_writelane_b32 v0, s74, 43
-; CHECK-NEXT:    v_writelane_b32 v0, s75, 44
-; CHECK-NEXT:    v_writelane_b32 v0, s76, 45
-; CHECK-NEXT:    v_writelane_b32 v0, s77, 46
-; CHECK-NEXT:    v_writelane_b32 v0, s78, 47
-; CHECK-NEXT:    v_writelane_b32 v0, s79, 48
-; CHECK-NEXT:    v_writelane_b32 v0, s80, 49
-; CHECK-NEXT:    v_writelane_b32 v0, s81, 50
-; CHECK-NEXT:    v_writelane_b32 v0, s82, 51
-; CHECK-NEXT:    v_writelane_b32 v0, s83, 52
-; CHECK-NEXT:    v_writelane_b32 v0, s84, 53
-; CHECK-NEXT:    v_writelane_b32 v0, s85, 54
-; CHECK-NEXT:    v_writelane_b32 v0, s86, 55
-; CHECK-NEXT:    v_writelane_b32 v0, s87, 56
-; CHECK-NEXT:    v_writelane_b32 v0, s88, 57
+; CHECK-NEXT:    v_writelane_b32 v0, s33, 0
+; CHECK-NEXT:    v_writelane_b32 v0, s34, 1
+; CHECK-NEXT:    v_writelane_b32 v0, s35, 2
+; CHECK-NEXT:    v_writelane_b32 v0, s36, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s37, 4
+; CHECK-NEXT:    v_writelane_b32 v0, s38, 5
+; CHECK-NEXT:    v_writelane_b32 v0, s39, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s40, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s41, 8
+; CHECK-NEXT:    v_writelane_b32 v0, s42, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s43, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s44, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s45, 12
+; CHECK-NEXT:    v_writelane_b32 v0, s46, 13
+; CHECK-NEXT:    v_writelane_b32 v0, s47, 14
+; CHECK-NEXT:    v_writelane_b32 v0, s48, 15
+; CHECK-NEXT:    v_writelane_b32 v0, s49, 16
+; CHECK-NEXT:    v_writelane_b32 v0, s50, 17
+; CHECK-NEXT:    v_writelane_b32 v0, s51, 18
+; CHECK-NEXT:    v_writelane_b32 v0, s52, 19
+; CHECK-NEXT:    v_writelane_b32 v0, s53, 20
+; CHECK-NEXT:    v_writelane_b32 v0, s54, 21
+; CHECK-NEXT:    v_writelane_b32 v0, s55, 22
+; CHECK-NEXT:    v_writelane_b32 v0, s56, 23
+; CHECK-NEXT:    v_writelane_b32 v0, s57, 24
+; CHECK-NEXT:    v_writelane_b32 v0, s58, 25
+; CHECK-NEXT:    v_writelane_b32 v0, s59, 26
+; CHECK-NEXT:    v_writelane_b32 v0, s60, 27
+; CHECK-NEXT:    v_writelane_b32 v0, s61, 28
+; CHECK-NEXT:    v_writelane_b32 v0, s62, 29
+; CHECK-NEXT:    v_writelane_b32 v0, s63, 30
+; CHECK-NEXT:    v_writelane_b32 v0, s64, 31
+; CHECK-NEXT:    v_writelane_b32 v0, s65, 32
+; CHECK-NEXT:    v_writelane_b32 v0, s66, 33
+; CHECK-NEXT:    v_writelane_b32 v0, s67, 34
+; CHECK-NEXT:    v_writelane_b32 v0, s68, 35
+; CHECK-NEXT:    v_writelane_b32 v0, s69, 36
+; CHECK-NEXT:    v_writelane_b32 v0, s70, 37
+; CHECK-NEXT:    v_writelane_b32 v0, s71, 38
+; CHECK-NEXT:    v_writelane_b32 v0, s72, 39
+; CHECK-NEXT:    v_writelane_b32 v0, s73, 40
+; CHECK-NEXT:    v_writelane_b32 v0, s74, 41
+; CHECK-NEXT:    v_writelane_b32 v0, s75, 42
+; CHECK-NEXT:    v_writelane_b32 v0, s76, 43
+; CHECK-NEXT:    v_writelane_b32 v0, s77, 44
+; CHECK-NEXT:    v_writelane_b32 v0, s78, 45
+; CHECK-NEXT:    v_writelane_b32 v0, s79, 46
+; CHECK-NEXT:    v_writelane_b32 v0, s80, 47
+; CHECK-NEXT:    v_writelane_b32 v0, s81, 48
+; CHECK-NEXT:    v_writelane_b32 v0, s82, 49
+; CHECK-NEXT:    v_writelane_b32 v0, s83, 50
+; CHECK-NEXT:    v_writelane_b32 v0, s84, 51
+; CHECK-NEXT:    v_writelane_b32 v0, s85, 52
+; CHECK-NEXT:    v_writelane_b32 v0, s86, 53
+; CHECK-NEXT:    v_writelane_b32 v0, s87, 54
+; CHECK-NEXT:    v_writelane_b32 v0, s88, 55
+; CHECK-NEXT:    v_writelane_b32 v0, s89, 56
+; CHECK-NEXT:    v_writelane_b32 v0, s90, 57
 ; CHECK-NEXT:    s_waitcnt expcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v1, s95, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s89, 58
-; CHECK-NEXT:    v_writelane_b32 v1, s96, 1
-; CHECK-NEXT:    v_writelane_b32 v0, s90, 59
-; CHECK-NEXT:    v_writelane_b32 v1, s97, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s91, 60
-; CHECK-NEXT:    v_writelane_b32 v1, s98, 3
-; CHECK-NEXT:    v_writelane_b32 v0, s92, 61
-; CHECK-NEXT:    v_writelane_b32 v1, s99, 4
-; CHECK-NEXT:    v_writelane_b32 v0, s93, 62
-; CHECK-NEXT:    v_writelane_b32 v1, s100, 5
-; CHECK-NEXT:    s_mov_b32 s31, s12
-; CHECK-NEXT:    v_writelane_b32 v0, s94, 63
-; CHECK-NEXT:    v_writelane_b32 v1, s101, 6
-; CHECK-NEXT:    s_cmp_eq_u32 s31, 0
+; CHECK-NEXT:    v_writelane_b32 v1, s97, 0
+; CHECK-NEXT:    v_writelane_b32 v0, s91, 58
+; CHECK-NEXT:    v_writelane_b32 v1, s98, 1
+; CHECK-NEXT:    v_writelane_b32 v0, s92, 59
+; CHECK-NEXT:    v_writelane_b32 v1, s99, 2
+; CHECK-NEXT:    v_writelane_b32 v0, s93, 60
+; CHECK-NEXT:    v_writelane_b32 v1, s100, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s94, 61
+; CHECK-NEXT:    v_writelane_b32 v1, s101, 4
+; CHECK-NEXT:    v_writelane_b32 v0, s95, 62
+; CHECK-NEXT:    v_writelane_b32 v1, s30, 5
+; CHECK-NEXT:    s_mov_b32 s29, s12
+; CHECK-NEXT:    v_writelane_b32 v0, s96, 63
+; CHECK-NEXT:    v_writelane_b32 v1, s31, 6
+; CHECK-NEXT:    s_cmp_eq_u32 s29, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -1336,6 +1336,7 @@ define void @spill_func(i32 addrspace(1)* %arg) #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use s5
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s4, v1, 5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use s6
 ; CHECK-NEXT:    ;;#ASMEND
@@ -1630,83 +1631,82 @@ define void @spill_func(i32 addrspace(1)* %arg) #0 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; reg use vcc_hi
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s101, v1, 6
-; CHECK-NEXT:    v_readlane_b32 s100, v1, 5
-; CHECK-NEXT:    v_readlane_b32 s99, v1, 4
-; CHECK-NEXT:    v_readlane_b32 s98, v1, 3
-; CHECK-NEXT:    v_readlane_b32 s97, v1, 2
-; CHECK-NEXT:    v_readlane_b32 s96, v1, 1
-; CHECK-NEXT:    v_readlane_b32 s95, v1, 0
-; CHECK-NEXT:    v_readlane_b32 s94, v0, 63
-; CHECK-NEXT:    v_readlane_b32 s93, v0, 62
-; CHECK-NEXT:    v_readlane_b32 s92, v0, 61
-; CHECK-NEXT:    v_readlane_b32 s91, v0, 60
-; CHECK-NEXT:    v_readlane_b32 s90, v0, 59
-; CHECK-NEXT:    v_readlane_b32 s89, v0, 58
-; CHECK-NEXT:    v_readlane_b32 s88, v0, 57
-; CHECK-NEXT:    v_readlane_b32 s87, v0, 56
-; CHECK-NEXT:    v_readlane_b32 s86, v0, 55
-; CHECK-NEXT:    v_readlane_b32 s85, v0, 54
-; CHECK-NEXT:    v_readlane_b32 s84, v0, 53
-; CHECK-NEXT:    v_readlane_b32 s83, v0, 52
-; CHECK-NEXT:    v_readlane_b32 s82, v0, 51
-; CHECK-NEXT:    v_readlane_b32 s81, v0, 50
-; CHECK-NEXT:    v_readlane_b32 s80, v0, 49
-; CHECK-NEXT:    v_readlane_b32 s79, v0, 48
-; CHECK-NEXT:    v_readlane_b32 s78, v0, 47
-; CHECK-NEXT:    v_readlane_b32 s77, v0, 46
-; CHECK-NEXT:    v_readlane_b32 s76, v0, 45
-; CHECK-NEXT:    v_readlane_b32 s75, v0, 44
-; CHECK-NEXT:    v_readlane_b32 s74, v0, 43
-; CHECK-NEXT:    v_readlane_b32 s73, v0, 42
-; CHECK-NEXT:    v_readlane_b32 s72, v0, 41
-; CHECK-NEXT:    v_readlane_b32 s71, v0, 40
-; CHECK-NEXT:    v_readlane_b32 s70, v0, 39
-; CHECK-NEXT:    v_readlane_b32 s69, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s68, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s67, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s66, v0, 35
-; CHECK-NEXT:    v_readlane_b32 s65, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s64, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s63, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s62, v0, 31
-; CHECK-NEXT:    v_readlane_b32 s61, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s60, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s59, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s58, v0, 27
-; CHECK-NEXT:    v_readlane_b32 s57, v0, 26
-; CHECK-NEXT:    v_readlane_b32 s56, v0, 25
-; CHECK-NEXT:    v_readlane_b32 s55, v0, 24
-; CHECK-NEXT:    v_readlane_b32 s54, v0, 23
-; CHECK-NEXT:    v_readlane_b32 s53, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s52, v0, 21
-; CHECK-NEXT:    v_readlane_b32 s51, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s50, v0, 19
-; CHECK-NEXT:    v_readlane_b32 s49, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s48, v0, 17
-; CHECK-NEXT:    v_readlane_b32 s47, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s46, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s45, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s44, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s43, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s42, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s41, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s40, v0, 9
-; CHECK-NEXT:    v_readlane_b32 s39, v0, 8
-; CHECK-NEXT:    v_readlane_b32 s38, v0, 7
-; CHECK-NEXT:    v_readlane_b32 s37, v0, 6
-; CHECK-NEXT:    v_readlane_b32 s36, v0, 5
-; CHECK-NEXT:    v_readlane_b32 s35, v0, 4
-; CHECK-NEXT:    v_readlane_b32 s34, v0, 3
-; CHECK-NEXT:    v_readlane_b32 s33, v0, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v0, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v0, 0
-; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; CHECK-NEXT:    v_readlane_b32 s5, v1, 6
+; CHECK-NEXT:    v_readlane_b32 s101, v1, 4
+; CHECK-NEXT:    v_readlane_b32 s100, v1, 3
+; CHECK-NEXT:    v_readlane_b32 s99, v1, 2
+; CHECK-NEXT:    v_readlane_b32 s98, v1, 1
+; CHECK-NEXT:    v_readlane_b32 s97, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s96, v0, 63
+; CHECK-NEXT:    v_readlane_b32 s95, v0, 62
+; CHECK-NEXT:    v_readlane_b32 s94, v0, 61
+; CHECK-NEXT:    v_readlane_b32 s93, v0, 60
+; CHECK-NEXT:    v_readlane_b32 s92, v0, 59
+; CHECK-NEXT:    v_readlane_b32 s91, v0, 58
+; CHECK-NEXT:    v_readlane_b32 s90, v0, 57
+; CHECK-NEXT:    v_readlane_b32 s89, v0, 56
+; CHECK-NEXT:    v_readlane_b32 s88, v0, 55
+; CHECK-NEXT:    v_readlane_b32 s87, v0, 54
+; CHECK-NEXT:    v_readlane_b32 s86, v0, 53
+; CHECK-NEXT:    v_readlane_b32 s85, v0, 52
+; CHECK-NEXT:    v_readlane_b32 s84, v0, 51
+; CHECK-NEXT:    v_readlane_b32 s83, v0, 50
+; CHECK-NEXT:    v_readlane_b32 s82, v0, 49
+; CHECK-NEXT:    v_readlane_b32 s81, v0, 48
+; CHECK-NEXT:    v_readlane_b32 s80, v0, 47
+; CHECK-NEXT:    v_readlane_b32 s79, v0, 46
+; CHECK-NEXT:    v_readlane_b32 s78, v0, 45
+; CHECK-NEXT:    v_readlane_b32 s77, v0, 44
+; CHECK-NEXT:    v_readlane_b32 s76, v0, 43
+; CHECK-NEXT:    v_readlane_b32 s75, v0, 42
+; CHECK-NEXT:    v_readlane_b32 s74, v0, 41
+; CHECK-NEXT:    v_readlane_b32 s73, v0, 40
+; CHECK-NEXT:    v_readlane_b32 s72, v0, 39
+; CHECK-NEXT:    v_readlane_b32 s71, v0, 38
+; CHECK-NEXT:    v_readlane_b32 s70, v0, 37
+; CHECK-NEXT:    v_readlane_b32 s69, v0, 36
+; CHECK-NEXT:    v_readlane_b32 s68, v0, 35
+; CHECK-NEXT:    v_readlane_b32 s67, v0, 34
+; CHECK-NEXT:    v_readlane_b32 s66, v0, 33
+; CHECK-NEXT:    v_readlane_b32 s65, v0, 32
+; CHECK-NEXT:    v_readlane_b32 s64, v0, 31
+; CHECK-NEXT:    v_readlane_b32 s63, v0, 30
+; CHECK-NEXT:    v_readlane_b32 s62, v0, 29
+; CHECK-NEXT:    v_readlane_b32 s61, v0, 28
+; CHECK-NEXT:    v_readlane_b32 s60, v0, 27
+; CHECK-NEXT:    v_readlane_b32 s59, v0, 26
+; CHECK-NEXT:    v_readlane_b32 s58, v0, 25
+; CHECK-NEXT:    v_readlane_b32 s57, v0, 24
+; CHECK-NEXT:    v_readlane_b32 s56, v0, 23
+; CHECK-NEXT:    v_readlane_b32 s55, v0, 22
+; CHECK-NEXT:    v_readlane_b32 s54, v0, 21
+; CHECK-NEXT:    v_readlane_b32 s53, v0, 20
+; CHECK-NEXT:    v_readlane_b32 s52, v0, 19
+; CHECK-NEXT:    v_readlane_b32 s51, v0, 18
+; CHECK-NEXT:    v_readlane_b32 s50, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s49, v0, 16
+; CHECK-NEXT:    v_readlane_b32 s48, v0, 15
+; CHECK-NEXT:    v_readlane_b32 s47, v0, 14
+; CHECK-NEXT:    v_readlane_b32 s46, v0, 13
+; CHECK-NEXT:    v_readlane_b32 s45, v0, 12
+; CHECK-NEXT:    v_readlane_b32 s44, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s43, v0, 10
+; CHECK-NEXT:    v_readlane_b32 s42, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s41, v0, 8
+; CHECK-NEXT:    v_readlane_b32 s40, v0, 7
+; CHECK-NEXT:    v_readlane_b32 s39, v0, 6
+; CHECK-NEXT:    v_readlane_b32 s38, v0, 5
+; CHECK-NEXT:    v_readlane_b32 s37, v0, 4
+; CHECK-NEXT:    v_readlane_b32 s36, v0, 3
+; CHECK-NEXT:    v_readlane_b32 s35, v0, 2
+; CHECK-NEXT:    v_readlane_b32 s34, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s33, v0, 0
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %cnd = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
   %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0

diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 1a9ec6c31555f..e91d62c4c3f2e 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -17,10 +17,9 @@ define void @use_vcc() #1 {
 ; GCN: v_writelane_b32 v40, s30, 0
 ; GCN: v_writelane_b32 v40, s31, 1
 ; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s31, v40, 1
-; GCN: v_readlane_b32 s30, v40, 0
+; GCN: v_readlane_b32 s4, v40, 0
+; GCN: v_readlane_b32 s5, v40, 1
 ; GCN: v_readlane_b32 s33, v40, 2
-; GCN: s_setpc_b64 s[30:31]
 ; GCN: ; NumSgprs: 36
 ; GCN: ; NumVgprs: 41
 define void @indirect_use_vcc() #1 {

diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index f3c875bd58039..82d879d769310 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -25,26 +25,26 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
 ; MUBUF:   buffer_store_dword
 ; FLATSCR: scratch_store_dword
 ; GCN: v_writelane_b32 v40, s33, 4
-; GCN: v_writelane_b32 v40, s30, 0
-; GCN: v_writelane_b32 v40, s31, 1
-; GCN: v_writelane_b32 v40, s34, 2
-; GCN: v_writelane_b32 v40, s35, 3
+; GCN: v_writelane_b32 v40, s34, 0
+; GCN: v_writelane_b32 v40, s35, 1
+; GCN: v_writelane_b32 v40, s30, 2
+; GCN: v_writelane_b32 v40, s31, 3
 
 ; GCN: s_swappc_b64
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_swappc_b64
-; GCN: v_readlane_b32 s35, v40, 3
-; GCN: v_readlane_b32 s34, v40, 2
-; MUBUF-DAG:   v_readlane_b32 s31, v40, 1
-; MUBUF-DAG:   v_readlane_b32 s30, v40, 0
-; FLATSCR-DAG: v_readlane_b32 s31, v40, 1
-; FLATSCR-DAG: v_readlane_b32 s30, v40, 0
+; MUBUF-DAG:   v_readlane_b32 s4, v40, 2
+; MUBUF-DAG:   v_readlane_b32 s5, v40, 3
+; FLATSCR-DAG: v_readlane_b32 s0, v40, 2
+; FLATSCR-DAG: v_readlane_b32 s1, v40, 3
+; GCN: v_readlane_b32 s35, v40, 1
+; GCN: v_readlane_b32 s34, v40, 0
 
 ; GCN: v_readlane_b32 s33, v40, 4
 ; MUBUF:   buffer_load_dword
 ; FLATSCR: scratch_load_dword
-; GCN: s_setpc_b64 s[30:31]
+; GCN: s_setpc_b64
 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
   call void @external_void_func_void()
   call void asm sideeffect "", ""() #0
@@ -74,14 +74,11 @@ define void @test_func_call_external_void_funcx2() #0 {
 
 ; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31:
 ; GCN: s_waitcnt
-; GCN: v_writelane_b32 v0, s30, 0
-; GCN: v_writelane_b32 v0, s31, 1
+; GCN-NEXT: s_mov_b64 [[SAVEPC:s\[[0-9]+:[0-9]+\]]], s[30:31]
 ; GCN-NEXT: #ASMSTART
 ; GCN: ; clobber
 ; GCN-NEXT: #ASMEND
-; GCN: v_readlane_b32 s31, v0, 1
-; GCN: v_readlane_b32 s30, v0, 0
-; GCN: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64 [[SAVEPC]]
 define void @void_func_void_clobber_s30_s31() #2 {
   call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index e3e5f011ccfc4..e357b5adc8d42 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -102,8 +102,10 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
 
 ; GCN: s_swappc_b64
 
-; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]]
-; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]]
+; MUBUF-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
+; MUBUF-DAG: v_readlane_b32 s4, [[CSR_VGPR]]
+; FLATSCR-DAG: v_readlane_b32 s0, [[CSR_VGPR]]
+; FLATSCR-DAG: v_readlane_b32 s1, [[CSR_VGPR]]
 
 ; MUBUF:    s_addk_i32 s32, 0xfc00{{$}}
 ; FLATSCR:  s_add_i32 s32, s32, -16{{$}}
@@ -114,7 +116,7 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack_and_call() #0 {
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca
@@ -142,8 +144,10 @@ define void @callee_with_stack_and_call() #0 {
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
 ; GCN: s_swappc_b64
 
-; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0
-; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1
+; MUBUF-DAG: v_readlane_b32 s4, v40, 0
+; MUBUF-DAG: v_readlane_b32 s5, v40, 1
+; FLATSCR-DAG: v_readlane_b32 s0, v40, 0
+; FLATSCR-DAG: v_readlane_b32 s1, v40, 1
 
 ; MUBUF:   s_addk_i32 s32, 0xfc00
 ; FLATSCR: s_add_i32 s32, s32, -16
@@ -153,7 +157,7 @@ define void @callee_with_stack_and_call() #0 {
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64
 define void @callee_no_stack_with_call() #0 {
   call void @external_void_func_void()
   ret void
@@ -389,28 +393,31 @@ define void @realign_stack_no_fp_elim() #1 {
 ; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
+; GCN-NEXT: v_writelane_b32 v0, s33, 2
 ; GCN-NEXT: s_mov_b32 s33, s32
-; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0
+; GCN-NEXT: v_writelane_b32 v0, s30, 0
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; MUBUF:        s_addk_i32 s32, 0x300
-; FLATSCR:      s_add_i32 s32, s32, 12
-; GCN: v_writelane_b32 [[CSR_VGPR]], s31, 1
+; GCN: v_writelane_b32 v0, s31, 1
 ; MUBUF:   buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}}
 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}}
 ; GCN-NEXT:     s_waitcnt vmcnt(0)
 ; GCN: ;;#ASMSTART
-; GCN:     v_readlane_b32 s31, [[CSR_VGPR]], 1
-; GCN:     v_readlane_b32 s30, [[CSR_VGPR]], 0
-; MUBUF:   s_addk_i32 s32, 0xfd00
-; FLATSCR: s_add_i32 s32, s32, -12
-; GCN-NEXT:     v_readlane_b32 s33, [[CSR_VGPR]], 2
+; MUBUF:        s_addk_i32 s32, 0x300
+; MUBUF-NEXT:   v_readlane_b32 s4, v0, 0
+; MUBUF-NEXT:   v_readlane_b32 s5, v0, 1
+; FLATSCR:      s_add_i32 s32, s32, 12
+; FLATSCR-NEXT: v_readlane_b32 s0, v0, 0
+; FLATSCR-NEXT: v_readlane_b32 s1, v0, 1
+; MUBUF-NEXT:   s_addk_i32 s32, 0xfd00
+; FLATSCR-NEXT: s_add_i32 s32, s32, -12
+; GCN-NEXT:     v_readlane_b32 s33, v0, 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; MUBUF-NEXT:   s_setpc_b64 s[4:5]
+; FLATSCR-NEXT: s_setpc_b64 s[0:1]
 define void @no_unused_non_csr_sgpr_for_fp() #1 {
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca
@@ -434,22 +441,28 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
 ; GCN-NEXT: s_mov_b32 s33, s32
-; MUBUF:       s_addk_i32 s32, 0x300{{$}}
-; FLATSCR:     s_add_i32 s32, s32, 12{{$}}
+; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0
 
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
 ; MUBUF-DAG:   buffer_store_dword
 ; FLATSCR-DAG: scratch_store_dword
+; MUBUF:       s_addk_i32 s32, 0x300{{$}}
+; FLATSCR:     s_add_i32 s32, s32, 12{{$}}
 
+; MUBUF:        v_readlane_b32 s4, [[CSR_VGPR]], 0
+; FLATSCR:      v_readlane_b32 s0, [[CSR_VGPR]], 0
 ; GCN: ;;#ASMSTART
-; MUBUF:   s_addk_i32 s32, 0xfd00{{$}}
-; FLATSCR: s_add_i32 s32, s32, -12{{$}}
+; MUBUF:        v_readlane_b32 s5, [[CSR_VGPR]], 1
+; FLATSCR:      v_readlane_b32 s1, [[CSR_VGPR]], 1
+; MUBUF-NEXT:   s_addk_i32 s32, 0xfd00{{$}}
+; FLATSCR-NEXT: s_add_i32 s32, s32, -12{{$}}
 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64
 define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca
@@ -481,15 +494,21 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
+; GCN-DAG:  v_writelane_b32 [[CSR_VGPR]], s30, 0
 ; GCN-DAG:  s_mov_b32 s33, s32
+; GCN-DAG:  v_writelane_b32 [[CSR_VGPR]], s31, 1
 ; MUBUF-DAG:   s_add_i32 s32, s32, 0x40300{{$}}
 ; FLATSCR-DAG: s_addk_i32 s32, 0x100c{{$}}
 ; MUBUF-DAG:   buffer_store_dword
 ; FLATSCR-DAG: scratch_store_dword
 
+; MUBUF:   v_readlane_b32 s4, [[CSR_VGPR]], 0
+; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0
 ; GCN: ;;#ASMSTART
-; MUBUF:   s_add_i32 s32, s32, 0xfffbfd00{{$}}
-; FLATSCR: s_addk_i32 s32, 0xeff4{{$}}
+; MUBUF:   v_readlane_b32 s5, [[CSR_VGPR]], 1
+; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1
+; MUBUF-NEXT:   s_add_i32 s32, s32, 0xfffbfd00{{$}}
+; FLATSCR-NEXT: s_addk_i32 s32, 0xeff4{{$}}
 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100
@@ -498,7 +517,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64
 define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval([4096 x i8]) align 4 %arg) #1 {
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca
@@ -530,7 +549,7 @@ define internal void @local_empty_func() #0 {
 ; An FP is needed, despite not needing any spills
 ; TODO: Could see callee does not use stack and omit FP.
 ; GCN-LABEL: {{^}}ipra_call_with_stack:
-; GCN: v_writelane_b32 v0, s33, 2
+; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
 ; GCN: s_mov_b32 s33, s32
 ; MUBUF:   s_addk_i32 s32, 0x400
 ; FLATSCR: s_add_i32 s32, s32, 16
@@ -539,7 +558,7 @@ define internal void @local_empty_func() #0 {
 ; GCN:     s_swappc_b64
 ; MUBUF:   s_addk_i32 s32, 0xfc00
 ; FLATSCR: s_add_i32 s32, s32, -16
-; GCN: v_readlane_b32 s33, v0, 2
+; GCN: s_mov_b32 s33, [[FP_COPY:s[0-9]+]]
 define void @ipra_call_with_stack() #0 {
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca

diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 980b0bc631af7..351aabf257389 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -317,7 +317,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 {
 ; GCN-NOT: s12
 ; GCN-NOT: s13
 ; GCN-NOT: s14
-; GCN: v_readlane_b32 s30, v40, 0
+; GCN: v_readlane_b32 s4, v40, 0
 define hidden void @func_indirect_use_workgroup_id_x() #1 {
   call void @use_workgroup_id_x()
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 29fc098899ee5..82dddcb6a1481 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -39,15 +39,15 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
 ; GCN-NEXT:    s_add_u32 s16, s16, func_v2f32@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s4, v40, 0
+; GCN-NEXT:    v_readlane_b32 s5, v40, 1
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    v_readlane_b32 s33, v40, 2
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 bb0:
   %split.ret.type = call <2 x float> @func_v2f32()
   br label %bb1
@@ -73,15 +73,15 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
 ; GCN-NEXT:    s_add_u32 s16, s16, func_v3f32@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s4, v40, 0
+; GCN-NEXT:    v_readlane_b32 s5, v40, 1
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    v_readlane_b32 s33, v40, 2
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 bb0:
   %split.ret.type = call <3 x float> @func_v3f32()
   br label %bb1
@@ -107,15 +107,15 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
 ; GCN-NEXT:    s_add_u32 s16, s16, func_v4f16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s4, v40, 0
+; GCN-NEXT:    v_readlane_b32 s5, v40, 1
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    v_readlane_b32 s33, v40, 2
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 bb0:
   %split.ret.type = call <4 x half> @func_v4f16()
   br label %bb1
@@ -141,16 +141,16 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT:    s_add_u32 s16, s16, func_struct@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_struct@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    v_readlane_b32 s4, v40, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, v4
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s5, v40, 1
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    v_readlane_b32 s33, v40, 2
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 bb0:
   %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
   br label %bb1

diff --git a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll
index cf610f9436acd..121d5d35bfe7a 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll
@@ -7,29 +7,32 @@
 define float @fdiv_f32(float %a, float %b) #0 {
   ; GCN-LABEL: name: fdiv_f32
   ; GCN: bb.0.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GCN-NEXT:   %4:vgpr_32, %5:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %8:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN-NEXT:   %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %8:vgpr_32, %9:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %10:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
   ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GCN-NEXT:   S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode
-  ; GCN-NEXT:   %12:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %8, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %13:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %12, 0, %8, 0, %8, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %14:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %4, 0, %13, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %15:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %14, 0, %4, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %16:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %15, 0, %13, 0, %14, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %16, 0, %4, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %14:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %15:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %16:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %18:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %19:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode
-  ; GCN-NEXT:   $vcc = COPY %5
-  ; GCN-NEXT:   %18:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %17, 0, %13, 0, %16, 0, 0, implicit $mode, implicit $vcc, implicit $exec
-  ; GCN-NEXT:   %19:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %18, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   $vgpr0 = COPY %19
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   $vcc = COPY %7
+  ; GCN-NEXT:   %20:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec
+  ; GCN-NEXT:   %21:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   $vgpr0 = COPY %21
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; GCN-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0
 entry:
   %fdiv = fdiv float %a, %b
   ret float %fdiv
@@ -38,29 +41,32 @@ entry:
 define float @fdiv_nnan_f32(float %a, float %b) #0 {
   ; GCN-LABEL: name: fdiv_nnan_f32
   ; GCN: bb.0.entry:
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GCN-NEXT:   %4:vgpr_32, %5:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %8:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN-NEXT:   %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %8:vgpr_32, %9:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %10:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
   ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GCN-NEXT:   S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode
-  ; GCN-NEXT:   %12:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %8, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %13:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %12, 0, %8, 0, %8, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %14:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %4, 0, %13, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %14, 0, %4, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %16:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %15, 0, %13, 0, %14, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %16, 0, %4, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %14:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %16:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %18:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   %19:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode
-  ; GCN-NEXT:   $vcc = COPY %5
-  ; GCN-NEXT:   %18:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %17, 0, %13, 0, %16, 0, 0, implicit $mode, implicit $vcc, implicit $exec
-  ; GCN-NEXT:   %19:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %18, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   $vgpr0 = COPY %19
-  ; GCN-NEXT:   SI_RETURN implicit $vgpr0
+  ; GCN-NEXT:   $vcc = COPY %7
+  ; GCN-NEXT:   %20:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec
+  ; GCN-NEXT:   %21:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   $vgpr0 = COPY %21
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; GCN-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0
 entry:
   %fdiv = fdiv nnan float %a, %b
   ret float %fdiv

diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
index 47287f67b01ef..408e00f1f37d9 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
@@ -10,17 +10,17 @@
 define i32 @fp_save_restore_in_temp_sgpr(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 4 %arg) #0 {
   ; GCN-LABEL: name: fp_save_restore_in_temp_sgpr
   ; GCN: bb.0.begin:
-  ; GCN:   liveins: $sgpr11
+  ; GCN:   liveins: $sgpr11, $sgpr30_sgpr31
   ; GCN:   $sgpr11 = frame-setup COPY $sgpr33
   ; GCN:   $sgpr33 = frame-setup COPY $sgpr32
   ; GCN: bb.1.lp_end:
-  ; GCN:   liveins: $sgpr10, $sgpr11, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
+  ; GCN:   liveins: $sgpr10, $sgpr11, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr30_sgpr31
   ; GCN: bb.2.lp_begin:
-  ; GCN:   liveins: $sgpr10, $sgpr11, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7
+  ; GCN:   liveins: $sgpr10, $sgpr11, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr30_sgpr31
   ; GCN: bb.3.Flow:
-  ; GCN:   liveins: $sgpr10, $sgpr11, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
+  ; GCN:   liveins: $sgpr10, $sgpr11, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr30_sgpr31
   ; GCN: bb.4.end:
-  ; GCN:   liveins: $sgpr11, $vgpr0, $sgpr4_sgpr5
+  ; GCN:   liveins: $sgpr11, $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31
   ; GCN:   $sgpr33 = frame-destroy COPY $sgpr11
 begin:
   br label %lp_begin

diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index bf783cba5ace7..ea461cab629df 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -202,9 +202,9 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v2
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -220,9 +220,9 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -238,9 +238,9 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v2, v3, v2
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT:    v_exp_f32_e32 v1, v2
+; GFX90A-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
@@ -302,9 +302,9 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v2
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -320,9 +320,9 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -338,9 +338,9 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v2, v3, v2
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT:    v_exp_f32_e32 v1, v2
+; GFX90A-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
@@ -403,9 +403,9 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v2
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -421,9 +421,9 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -439,9 +439,9 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v2, v3, v2
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT:    v_exp_f32_e32 v1, v2
+; GFX90A-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
@@ -509,9 +509,9 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v2
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -527,9 +527,9 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -545,9 +545,9 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v2, v3, v2
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT:    v_exp_f32_e32 v1, v2
+; GFX90A-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 450cae8a4a388..e7121cec90c8a 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -25,15 +25,15 @@ define void @callee_with_stack_and_call() #0 {
 ; SPILL-TO-VGPR-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; SPILL-TO-VGPR-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v40, 1
-; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v40, 0
+; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s4, v40, 0
+; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s5, v40, 1
 ; SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0xfc00
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s33, v40, 2
-; SPILL-TO-VGPR-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SPILL-TO-VGPR-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; SPILL-TO-VGPR-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
+; SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[6:7]
 ; SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; SPILL-TO-VGPR-NEXT:    s_setpc_b64 s[30:31]
+; SPILL-TO-VGPR-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call:
 ; NO-SPILL-TO-VGPR:       ; %bb.0:
@@ -43,21 +43,14 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b32 s33, s32
 ; NO-SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0x800
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[8:9], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 3
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v1, s30, 0
+; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v1, s31, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[8:9]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[8:9], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v1, s31, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[8:9]
 ; NO-SPILL-TO-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
@@ -65,29 +58,21 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; NO-SPILL-TO-VGPR-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; NO-SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v1, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[6:7], exec
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 3
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v1, 0
+; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s4, v1, 0
+; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s5, v1, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[6:7]
 ; NO-SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0xf800
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    v_readfirstlane_b32 s33, v0
-; NO-SPILL-TO-VGPR-NEXT:    s_setpc_b64 s[30:31]
+; NO-SPILL-TO-VGPR-NEXT:    s_setpc_b64 s[4:5]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca
   call void @external_void_func_void()

diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 940614151e2d5..5214a02b9f74f 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -949,24 +949,24 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v6, 1, v6
 ; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v7
 ; GFX10-NEXT:    v_lshrrev_b16 v7, v7, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
-; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX10-NEXT:    v_xor_b32_e32 v12, -1, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v6, v9, v6
-; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v5
+; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshlrev_b16 v12, 1, v12
+; GFX10-NEXT:    v_lshlrev_b16 v8, 1, v8
 ; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v11
-; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
-; GFX10-NEXT:    v_lshlrev_b16 v1, v9, v1
+; GFX10-NEXT:    v_lshlrev_b16 v0, v9, v0
+; GFX10-NEXT:    v_lshlrev_b16 v1, v12, v1
 ; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
 ; GFX10-NEXT:    v_lshrrev_b16 v4, v11, v10
-; GFX10-NEXT:    v_lshlrev_b16 v5, v13, v12
+; GFX10-NEXT:    v_lshlrev_b16 v5, v13, v8
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3

diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 44450ffe44acc..989e8bfe7f564 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -105,13 +105,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i1@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i1@rel32@hi+12
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -132,15 +132,15 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i1 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i1 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -169,8 +169,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -197,14 +197,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i1_signext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_signext@rel32@hi+12
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -226,16 +226,16 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i1_signext@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_signext@rel32@hi+12
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -266,8 +266,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -295,14 +295,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i1_zeroext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_zeroext@rel32@hi+12
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -324,16 +324,16 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i1_zeroext@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_zeroext@rel32@hi+12
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -364,8 +364,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -392,12 +392,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i8@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i8@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i8@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -418,14 +418,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i8@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i8@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i8@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -453,8 +453,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -481,12 +481,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i8_signext@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i8_signext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_signext@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -508,14 +508,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i8_signext@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i8_signext@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_signext@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -544,8 +544,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -573,12 +573,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i8_zeroext@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i8_zeroext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_zeroext@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -600,14 +600,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i8_zeroext@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i8_zeroext@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_zeroext@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -636,8 +636,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -664,12 +664,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i16@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -690,14 +690,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i16@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i16@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i16@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -725,8 +725,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -753,12 +753,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i16_signext@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i16_signext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_signext@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -780,14 +780,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i16_signext@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i16_signext@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_signext@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -816,8 +816,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -845,12 +845,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i16_zeroext@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i16_zeroext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_zeroext@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -872,14 +872,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i16_zeroext@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i16_zeroext@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_zeroext@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -908,8 +908,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -936,12 +936,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -962,14 +962,14 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -997,8 +997,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1025,12 +1025,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1053,13 +1053,13 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i64@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i64@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -1088,8 +1088,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1117,12 +1117,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1144,15 +1144,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -1182,8 +1182,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1213,12 +1213,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1243,13 +1243,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -1280,8 +1280,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1311,12 +1311,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i64@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1342,13 +1342,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i64@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i64@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i64@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -1380,8 +1380,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1416,12 +1416,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i64@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1449,13 +1449,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i64@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i64@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i64@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -1489,8 +1489,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1518,12 +1518,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_f16@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1544,14 +1544,14 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x4400
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f16@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f16@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f16@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -1579,8 +1579,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1606,12 +1606,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 4.0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_f32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1632,14 +1632,14 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 4.0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -1667,8 +1667,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1695,12 +1695,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1723,13 +1723,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -1758,8 +1758,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1787,12 +1787,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1816,13 +1816,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -1852,8 +1852,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1883,12 +1883,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, -1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0.5
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v5f32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v5f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v5f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1914,13 +1914,13 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0.5
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v5f32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v5f32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v5f32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -1952,8 +1952,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1980,12 +1980,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2008,13 +2008,13 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f64@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f64@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2043,8 +2043,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2073,12 +2073,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f64@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2103,13 +2103,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f64@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f64@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f64@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2140,8 +2140,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2172,12 +2172,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x40200000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f64@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f64@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2205,12 +2205,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f64@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f64@rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f64@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f64@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2243,8 +2243,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2270,12 +2270,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i16@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2296,14 +2296,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i16@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i16@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i16@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2331,8 +2331,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2359,12 +2359,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2385,14 +2385,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2420,8 +2420,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2448,12 +2448,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2474,14 +2474,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2509,8 +2509,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2538,12 +2538,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2566,13 +2566,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2601,8 +2601,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2629,12 +2629,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2657,13 +2657,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2692,8 +2692,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2719,12 +2719,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2745,14 +2745,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2780,8 +2780,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2809,12 +2809,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40003
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2837,13 +2837,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2872,8 +2872,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2899,12 +2899,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f16@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -2925,14 +2925,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f16@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f16@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f16@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -2960,8 +2960,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -2988,12 +2988,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -3014,14 +3014,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -3049,8 +3049,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -3078,12 +3078,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -3106,13 +3106,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -3141,8 +3141,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -3170,12 +3170,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -3199,13 +3199,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -3235,8 +3235,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -3265,12 +3265,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32_i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -3295,13 +3295,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32_i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -3332,8 +3332,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -3359,12 +3359,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -3385,14 +3385,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -3420,8 +3420,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -3451,12 +3451,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -3481,13 +3481,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -3518,8 +3518,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -3549,12 +3549,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v5i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v5i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v5i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -3580,13 +3580,13 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 5
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v5i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v5i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v5i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -3618,8 +3618,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -3639,22 +3639,22 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[34:35]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[34:35] offset:16
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[30:31]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[30:31] offset:16
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -3671,23 +3671,24 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[34:35]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[34:35] offset:16
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[30:31]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[30:31] offset:16
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -3720,8 +3721,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -3756,12 +3757,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 7
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 8
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -3791,12 +3792,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -3831,8 +3832,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -3852,24 +3853,24 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[34:35]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[34:35] offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v16, s[34:35] offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v16, s[34:35] offset:48
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v16i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v16i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v16, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[30:31]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[30:31] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v16, s[30:31] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v16, s[30:31] offset:48
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v16i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -3886,25 +3887,26 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[34:35]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[34:35] offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v16, s[34:35] offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v16, s[34:35] offset:48
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v16i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v16i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[30:31]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[30:31] offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v16, s[30:31] offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v16, s[30:31] offset:48
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v16i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -3939,8 +3941,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -3962,29 +3964,29 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[34:35]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[34:35] offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[34:35] offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[34:35] offset:48
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[34:35] offset:64
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[34:35] offset:80
-; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[34:35] offset:96
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[30:31]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[30:31] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[30:31] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[30:31] offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[30:31] offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[30:31] offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[30:31] offset:96
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[34:35] offset:112
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v32i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v32i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[30:31] offset:112
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -4001,29 +4003,30 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[34:35]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v32, s[34:35] offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v32, s[34:35] offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v32, s[34:35] offset:48
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v32, s[34:35] offset:64
-; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
-; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
-; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v32i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v32i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x7
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[30:31]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v32, s[30:31] offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v32, s[30:31] offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v32, s[30:31] offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v32, s[30:31] offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[30:31] offset:80
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[30:31] offset:96
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[30:31] offset:112
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -4062,8 +4065,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -4085,32 +4088,32 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    global_load_dword v32, v[0:1], off
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[34:35]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[34:35] offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[34:35] offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[34:35] offset:48
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[34:35] offset:64
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[34:35] offset:80
-; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[34:35] offset:96
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[30:31]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[30:31] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[30:31] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[30:31] offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[30:31] offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[30:31] offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[30:31] offset:96
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[34:35] offset:112
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[30:31] offset:112
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v32i32_i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v32i32_i32@rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32@rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -4127,32 +4130,33 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    global_load_dword v33, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[34:35]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v32, s[34:35] offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v32, s[34:35] offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v32, s[34:35] offset:48
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v32, s[34:35] offset:64
-; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
-; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
-; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v32i32_i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v32i32_i32@rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[30:31]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v32, s[30:31] offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v32, s[30:31] offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v32, s[30:31] offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v32, s[30:31] offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[30:31] offset:80
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[30:31] offset:96
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[30:31] offset:112
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32@rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -4194,8 +4198,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -4228,16 +4232,16 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v42, v1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_i32_func_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_i32_func_i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    global_store_dword v[41:42], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -4263,18 +4267,18 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_mov_b32_e32 v42, v1
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_i32_func_i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_i32_func_i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    global_store_dword v[41:42], v0, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -4311,8 +4315,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -4333,22 +4337,22 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v2, s[34:35]
-; GFX9-NEXT:    global_load_dword v1, v2, s[34:35] offset:4
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_struct_i8_i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    global_load_ubyte v0, v2, s[30:31]
+; GFX9-NEXT:    global_load_dword v1, v2, s[30:31] offset:4
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_struct_i8_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_struct_i8_i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -4365,23 +4369,24 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_ubyte v0, v2, s[34:35]
-; GFX10-NEXT:    global_load_dword v1, v2, s[34:35] offset:4
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_struct_i8_i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_struct_i8_i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_ubyte v0, v2, s[30:31]
+; GFX10-NEXT:    global_load_dword v1, v2, s[30:31] offset:4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_struct_i8_i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_struct_i8_i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -4414,8 +4419,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -4447,12 +4452,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -4478,13 +4483,13 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -4516,8 +4521,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -4554,14 +4559,14 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT:    v_add_u32_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4593,17 +4598,17 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v1, 5, s33
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 8, v0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -4645,8 +4650,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_ubyte v0, off, s33 offset:8
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v1, off, s33 offset:12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
@@ -4684,18 +4689,18 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[34:35]
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v16i8@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v16i8@rel32@hi+12
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v16i8@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i8@rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -4715,9 +4720,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v16
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v18
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -4734,19 +4739,19 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[34:35]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[30:31]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v16i8@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v16i8@rel32@hi+12
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v16i8@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i8@rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -4766,9 +4771,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v16
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v18
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -4819,8 +4824,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -4842,91 +4847,88 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 32
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 30
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:20
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:16
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s40, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s41, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s42, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s43, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s44, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s45, 13
-; GFX9-NEXT:    v_writelane_b32 v40, s46, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s47, 15
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 16
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 17
-; GFX9-NEXT:    v_writelane_b32 v40, s50, 18
-; GFX9-NEXT:    v_writelane_b32 v40, s51, 19
-; GFX9-NEXT:    v_writelane_b32 v40, s52, 20
-; GFX9-NEXT:    v_writelane_b32 v40, s53, 21
-; GFX9-NEXT:    v_writelane_b32 v40, s54, 22
-; GFX9-NEXT:    v_writelane_b32 v40, s55, 23
-; GFX9-NEXT:    v_writelane_b32 v40, s56, 24
-; GFX9-NEXT:    v_writelane_b32 v40, s57, 25
-; GFX9-NEXT:    v_writelane_b32 v40, s58, 26
-; GFX9-NEXT:    v_writelane_b32 v40, s59, 27
-; GFX9-NEXT:    v_writelane_b32 v40, s60, 28
-; GFX9-NEXT:    v_writelane_b32 v40, s61, 29
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s40, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s41, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s42, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s43, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s44, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s45, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s46, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s47, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 16
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 17
+; GFX9-NEXT:    v_writelane_b32 v40, s52, 18
+; GFX9-NEXT:    v_writelane_b32 v40, s53, 19
+; GFX9-NEXT:    v_writelane_b32 v40, s54, 20
+; GFX9-NEXT:    v_writelane_b32 v40, s55, 21
+; GFX9-NEXT:    v_writelane_b32 v40, s56, 22
+; GFX9-NEXT:    v_writelane_b32 v40, s57, 23
+; GFX9-NEXT:    v_writelane_b32 v40, s58, 24
+; GFX9-NEXT:    v_writelane_b32 v40, s59, 25
+; GFX9-NEXT:    v_writelane_b32 v40, s60, 26
+; GFX9-NEXT:    v_writelane_b32 v40, s61, 27
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s62, 30
-; GFX9-NEXT:    v_writelane_b32 v40, s63, 31
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s62, 28
+; GFX9-NEXT:    v_writelane_b32 v40, s63, 29
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, byval_align16_f64_arg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, byval_align16_f64_arg@rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s63, v40, 31
-; GFX9-NEXT:    v_readlane_b32 s62, v40, 30
-; GFX9-NEXT:    v_readlane_b32 s61, v40, 29
-; GFX9-NEXT:    v_readlane_b32 s60, v40, 28
-; GFX9-NEXT:    v_readlane_b32 s59, v40, 27
-; GFX9-NEXT:    v_readlane_b32 s58, v40, 26
-; GFX9-NEXT:    v_readlane_b32 s57, v40, 25
-; GFX9-NEXT:    v_readlane_b32 s56, v40, 24
-; GFX9-NEXT:    v_readlane_b32 s55, v40, 23
-; GFX9-NEXT:    v_readlane_b32 s54, v40, 22
-; GFX9-NEXT:    v_readlane_b32 s53, v40, 21
-; GFX9-NEXT:    v_readlane_b32 s52, v40, 20
-; GFX9-NEXT:    v_readlane_b32 s51, v40, 19
-; GFX9-NEXT:    v_readlane_b32 s50, v40, 18
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 17
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 16
-; GFX9-NEXT:    v_readlane_b32 s47, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s46, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s45, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s44, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s43, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s42, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s41, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s40, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s63, v40, 29
+; GFX9-NEXT:    v_readlane_b32 s62, v40, 28
+; GFX9-NEXT:    v_readlane_b32 s61, v40, 27
+; GFX9-NEXT:    v_readlane_b32 s60, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s59, v40, 25
+; GFX9-NEXT:    v_readlane_b32 s58, v40, 24
+; GFX9-NEXT:    v_readlane_b32 s57, v40, 23
+; GFX9-NEXT:    v_readlane_b32 s56, v40, 22
+; GFX9-NEXT:    v_readlane_b32 s55, v40, 21
+; GFX9-NEXT:    v_readlane_b32 s54, v40, 20
+; GFX9-NEXT:    v_readlane_b32 s53, v40, 19
+; GFX9-NEXT:    v_readlane_b32 s52, v40, 18
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s47, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s46, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s45, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s44, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s43, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s42, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s41, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s40, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 32
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 30
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GFX10-LABEL: tail_call_byval_align16:
 ; GFX10:       ; %bb.0: ; %entry
@@ -4936,93 +4938,90 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 30
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_clause 0x2
 ; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:20
 ; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:16
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s33
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX10-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, byval_align16_f64_arg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, byval_align16_f64_arg@rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX10-NEXT:    v_writelane_b32 v40, s40, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s41, 9
-; GFX10-NEXT:    v_writelane_b32 v40, s42, 10
-; GFX10-NEXT:    v_writelane_b32 v40, s43, 11
-; GFX10-NEXT:    v_writelane_b32 v40, s44, 12
-; GFX10-NEXT:    v_writelane_b32 v40, s45, 13
-; GFX10-NEXT:    v_writelane_b32 v40, s46, 14
-; GFX10-NEXT:    v_writelane_b32 v40, s47, 15
-; GFX10-NEXT:    v_writelane_b32 v40, s48, 16
-; GFX10-NEXT:    v_writelane_b32 v40, s49, 17
-; GFX10-NEXT:    v_writelane_b32 v40, s50, 18
-; GFX10-NEXT:    v_writelane_b32 v40, s51, 19
-; GFX10-NEXT:    v_writelane_b32 v40, s52, 20
-; GFX10-NEXT:    v_writelane_b32 v40, s53, 21
-; GFX10-NEXT:    v_writelane_b32 v40, s54, 22
-; GFX10-NEXT:    v_writelane_b32 v40, s55, 23
-; GFX10-NEXT:    v_writelane_b32 v40, s56, 24
-; GFX10-NEXT:    v_writelane_b32 v40, s57, 25
-; GFX10-NEXT:    v_writelane_b32 v40, s58, 26
-; GFX10-NEXT:    v_writelane_b32 v40, s59, 27
-; GFX10-NEXT:    v_writelane_b32 v40, s60, 28
-; GFX10-NEXT:    v_writelane_b32 v40, s61, 29
-; GFX10-NEXT:    v_writelane_b32 v40, s62, 30
-; GFX10-NEXT:    v_writelane_b32 v40, s63, 31
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s63, v40, 31
-; GFX10-NEXT:    v_readlane_b32 s62, v40, 30
-; GFX10-NEXT:    v_readlane_b32 s61, v40, 29
-; GFX10-NEXT:    v_readlane_b32 s60, v40, 28
-; GFX10-NEXT:    v_readlane_b32 s59, v40, 27
-; GFX10-NEXT:    v_readlane_b32 s58, v40, 26
-; GFX10-NEXT:    v_readlane_b32 s57, v40, 25
-; GFX10-NEXT:    v_readlane_b32 s56, v40, 24
-; GFX10-NEXT:    v_readlane_b32 s55, v40, 23
-; GFX10-NEXT:    v_readlane_b32 s54, v40, 22
-; GFX10-NEXT:    v_readlane_b32 s53, v40, 21
-; GFX10-NEXT:    v_readlane_b32 s52, v40, 20
-; GFX10-NEXT:    v_readlane_b32 s51, v40, 19
-; GFX10-NEXT:    v_readlane_b32 s50, v40, 18
-; GFX10-NEXT:    v_readlane_b32 s49, v40, 17
-; GFX10-NEXT:    v_readlane_b32 s48, v40, 16
-; GFX10-NEXT:    v_readlane_b32 s47, v40, 15
-; GFX10-NEXT:    v_readlane_b32 s46, v40, 14
-; GFX10-NEXT:    v_readlane_b32 s45, v40, 13
-; GFX10-NEXT:    v_readlane_b32 s44, v40, 12
-; GFX10-NEXT:    v_readlane_b32 s43, v40, 11
-; GFX10-NEXT:    v_readlane_b32 s42, v40, 10
-; GFX10-NEXT:    v_readlane_b32 s41, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s40, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s40, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s41, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s42, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s43, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s44, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s45, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s46, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s47, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s48, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s49, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s50, 16
+; GFX10-NEXT:    v_writelane_b32 v40, s51, 17
+; GFX10-NEXT:    v_writelane_b32 v40, s52, 18
+; GFX10-NEXT:    v_writelane_b32 v40, s53, 19
+; GFX10-NEXT:    v_writelane_b32 v40, s54, 20
+; GFX10-NEXT:    v_writelane_b32 v40, s55, 21
+; GFX10-NEXT:    v_writelane_b32 v40, s56, 22
+; GFX10-NEXT:    v_writelane_b32 v40, s57, 23
+; GFX10-NEXT:    v_writelane_b32 v40, s58, 24
+; GFX10-NEXT:    v_writelane_b32 v40, s59, 25
+; GFX10-NEXT:    v_writelane_b32 v40, s60, 26
+; GFX10-NEXT:    v_writelane_b32 v40, s61, 27
+; GFX10-NEXT:    v_writelane_b32 v40, s62, 28
+; GFX10-NEXT:    v_writelane_b32 v40, s63, 29
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s63, v40, 29
+; GFX10-NEXT:    v_readlane_b32 s62, v40, 28
+; GFX10-NEXT:    v_readlane_b32 s61, v40, 27
+; GFX10-NEXT:    v_readlane_b32 s60, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s59, v40, 25
+; GFX10-NEXT:    v_readlane_b32 s58, v40, 24
+; GFX10-NEXT:    v_readlane_b32 s57, v40, 23
+; GFX10-NEXT:    v_readlane_b32 s56, v40, 22
+; GFX10-NEXT:    v_readlane_b32 s55, v40, 21
+; GFX10-NEXT:    v_readlane_b32 s54, v40, 20
+; GFX10-NEXT:    v_readlane_b32 s53, v40, 19
+; GFX10-NEXT:    v_readlane_b32 s52, v40, 18
+; GFX10-NEXT:    v_readlane_b32 s51, v40, 17
+; GFX10-NEXT:    v_readlane_b32 s50, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s49, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s48, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s47, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s46, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s45, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s44, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s43, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s42, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s41, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s40, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 30
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GFX10-SCRATCH-LABEL: tail_call_byval_align16:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
@@ -5032,90 +5031,87 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 offset:24 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, s33
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 30
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dwordx2 v[32:33], off, s33 offset:16
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v31, off, s33
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b64 s[4:5], s[30:31]
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s35, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 7
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s40, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s41, 9
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s42, 10
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s43, 11
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s44, 12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s45, 13
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s46, 14
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s47, 15
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 17
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 18
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 19
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 20
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 21
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 22
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 23
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s56, 24
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s57, 25
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s58, 26
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s59, 27
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s60, 28
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s61, 29
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s62, 30
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s63, 31
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s40, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s41, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s42, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s43, 9
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s44, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s45, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s46, 12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s47, 13
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 14
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 15
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 17
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 18
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 19
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 20
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 21
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s56, 22
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s57, 23
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s58, 24
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s59, 25
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s60, 26
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s61, 27
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s62, 28
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s63, 29
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s63, v40, 31
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s62, v40, 30
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s61, v40, 29
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s60, v40, 28
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s59, v40, 27
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s58, v40, 26
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s57, v40, 25
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s56, v40, 24
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 23
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 22
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 21
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 20
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 19
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 18
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 17
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s47, v40, 15
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s46, v40, 14
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s45, v40, 13
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s44, v40, 12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s43, v40, 11
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s42, v40, 10
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s41, v40, 9
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s40, v40, 8
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s63, v40, 29
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s62, v40, 28
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s61, v40, 27
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s60, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s59, v40, 25
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s58, v40, 24
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s57, v40, 23
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s56, v40, 22
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 21
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 20
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 19
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 18
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 17
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 16
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 15
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 14
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s47, v40, 13
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s46, v40, 12
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s45, v40, 11
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s44, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s43, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s42, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s41, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s40, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 30
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 offset:24 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca double, align 8, addrspace(5)
   tail call amdgpu_gfx void @byval_align16_f64_arg(<32 x i32> %val, double addrspace(5)* byval(double) align 16 %alloca)
@@ -5136,13 +5132,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i1_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_inreg@rel32@hi+12
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -5163,15 +5159,15 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i1_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -5200,8 +5196,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -5228,12 +5224,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i8_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i8_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -5254,16 +5250,16 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i8_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i8_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -5293,8 +5289,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
@@ -5322,12 +5318,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -5348,16 +5344,16 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i16_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i16_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -5387,8 +5383,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
@@ -5416,12 +5412,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s4, 42
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -5442,16 +5438,16 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 42
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -5481,8 +5477,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
@@ -5512,12 +5508,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_i64_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i64_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -5539,18 +5535,18 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_i64_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i64_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i64_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -5583,8 +5579,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -5610,19 +5606,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    s_mov_b64 s[30:31], 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5644,22 +5640,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
-; GFX10-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    s_mov_b64 s[30:31], 0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5696,8 +5692,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5735,12 +5731,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5764,9 +5760,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
@@ -5777,9 +5770,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s7, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5818,8 +5814,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5847,23 +5843,23 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
+; GFX9-NEXT:    s_mov_b64 s[30:31], 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX9-NEXT:    s_mov_b32 s8, 1
 ; GFX9-NEXT:    s_mov_b32 s9, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i64_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i64_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i64_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
@@ -5887,26 +5883,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 8
-; GFX10-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i64_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i64_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-NEXT:    s_mov_b32 s8, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-NEXT:    s_mov_b32 s9, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
+; GFX10-NEXT:    s_mov_b64 s[30:31], 0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i64_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i64_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
@@ -5949,8 +5945,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
@@ -5984,26 +5980,26 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX9-NEXT:    s_mov_b64 s[30:31], 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX9-NEXT:    s_mov_b32 s8, 1
 ; GFX9-NEXT:    s_mov_b32 s9, 2
 ; GFX9-NEXT:    s_mov_b32 s10, 3
 ; GFX9-NEXT:    s_mov_b32 s11, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i64_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i64_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i64_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -6029,17 +6025,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 10
-; GFX10-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i64_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i64_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-NEXT:    s_mov_b32 s8, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
@@ -6050,9 +6041,14 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s11, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX10-NEXT:    s_mov_b64 s[30:31], 0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i64_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i64_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -6101,8 +6097,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -6139,12 +6135,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x4400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_f16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -6165,16 +6161,16 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f16_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_movk_i32 s4, 0x4400
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f16_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f16_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -6204,8 +6200,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
@@ -6233,12 +6229,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s4, 4.0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_f32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -6259,16 +6255,16 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 4.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -6298,8 +6294,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
@@ -6329,12 +6325,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s4, 1.0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -6356,18 +6352,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -6400,8 +6396,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -6434,12 +6430,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 4.0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
@@ -6462,9 +6458,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 5
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
@@ -6473,9 +6466,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s6, 4.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
@@ -6511,8 +6507,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
@@ -6550,12 +6546,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s7, -1.0
 ; GFX9-NEXT:    s_mov_b32 s8, 0.5
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v5f32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v5f32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
@@ -6580,9 +6576,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 7
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
@@ -6595,9 +6588,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s8, 0.5
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v5f32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v5f32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
@@ -6639,8 +6635,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
@@ -6674,12 +6670,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    s_mov_b32 s5, 0x40100000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_f64_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f64_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -6701,18 +6697,18 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_f64_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 0x40100000
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f64_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f64_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -6745,8 +6741,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -6781,12 +6777,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f64_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f64_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6810,9 +6806,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f64_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
@@ -6823,9 +6816,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s7, 0x40100000
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f64_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f64_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6864,8 +6860,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6906,12 +6902,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s8, 0
 ; GFX9-NEXT:    s_mov_b32 s9, 0x40200000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f64_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f64_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
@@ -6937,9 +6933,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 8
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f64_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
@@ -6954,9 +6947,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s9, 0x40200000
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f64_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f64_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
@@ -7001,8 +6997,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
@@ -7030,17 +7026,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    s_load_dword s4, s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_load_dword s4, s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -7062,15 +7058,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_load_dword s4, s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i16_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    s_load_dword s4, s[30:31], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i16_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i16_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -7100,8 +7096,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
@@ -7126,17 +7122,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -7160,15 +7156,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7200,8 +7196,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7227,17 +7223,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -7261,15 +7257,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7301,8 +7297,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7334,12 +7330,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX9-NEXT:    s_mov_b32 s5, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -7361,18 +7357,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7405,8 +7401,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7437,12 +7433,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX9-NEXT:    s_movk_i32 s5, 0x4400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -7464,18 +7460,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_movk_i32 s5, 0x4400
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7508,8 +7504,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7534,17 +7530,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -7568,15 +7564,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7608,8 +7604,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7641,12 +7637,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX9-NEXT:    s_mov_b32 s5, 0x40003
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -7668,18 +7664,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 0x40003
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7712,8 +7708,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7737,17 +7733,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    s_load_dword s4, s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_load_dword s4, s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f16_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -7769,15 +7765,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_load_dword s4, s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2f16_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    s_load_dword s4, s[30:31], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f16_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f16_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -7807,8 +7803,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
@@ -7833,17 +7829,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -7867,15 +7863,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -7907,8 +7903,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -7940,12 +7936,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -7967,18 +7963,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -8011,8 +8007,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -8045,12 +8041,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s5, 4
 ; GFX9-NEXT:    s_mov_b32 s6, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
@@ -8073,9 +8069,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 5
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
@@ -8084,9 +8077,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s6, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
@@ -8122,8 +8118,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
@@ -8159,12 +8155,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s6, 5
 ; GFX9-NEXT:    s_mov_b32 s7, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8188,9 +8184,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
@@ -8201,9 +8194,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s7, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8242,8 +8238,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8272,17 +8268,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8310,15 +8306,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8354,8 +8350,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8393,12 +8389,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8422,9 +8418,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
@@ -8435,9 +8428,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s7, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8476,8 +8472,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8516,12 +8512,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s7, 4
 ; GFX9-NEXT:    s_mov_b32 s8, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v5i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v5i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v5i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
@@ -8546,9 +8542,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 7
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v5i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
@@ -8561,9 +8554,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s8, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v5i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v5i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
@@ -8605,8 +8601,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
@@ -8635,24 +8631,24 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[34:35], 0x0
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[30:31], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -8678,7 +8674,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 10
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
@@ -8689,16 +8684,17 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -8744,8 +8740,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -8796,12 +8792,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s10, 7
 ; GFX9-NEXT:    s_mov_b32 s11, 8
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
@@ -8829,9 +8825,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 10
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
@@ -8850,9 +8843,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s11, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
@@ -8903,8 +8899,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
@@ -8944,24 +8940,24 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s12, 8
 ; GFX9-NEXT:    v_writelane_b32 v40, s13, 9
 ; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v16i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v16i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v16i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i32_inreg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
@@ -8995,7 +8991,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 18
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
@@ -9014,16 +9009,17 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v16i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v16i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v16i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i32_inreg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
@@ -9085,8 +9081,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
@@ -9141,24 +9137,29 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX9-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX9-NEXT:    v_writelane_b32 v40, s24, 20
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX9-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX9-NEXT:    v_writelane_b32 v40, s27, 23
+; GFX9-NEXT:    v_writelane_b32 v40, s28, 24
+; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s28, 24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[30:31], 0x40
+; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s46
-; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s47
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s48
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s49
@@ -9167,7 +9168,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s50
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s51
 ; GFX9-NEXT:    s_mov_b32 s20, s36
@@ -9180,14 +9180,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s27, s43
 ; GFX9-NEXT:    s_mov_b32 s28, s44
 ; GFX9-NEXT:    s_mov_b32 s29, s45
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v32i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v32i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
@@ -9231,7 +9227,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 28
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
@@ -9250,30 +9245,41 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v32i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v32i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
+; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
+; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
+; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
+; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
+; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
+; GFX10-NEXT:    v_writelane_b32 v40, s29, 25
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[30:31], 0x40
+; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s46
-; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s47
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s48
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s49
 ; GFX10-NEXT:    s_mov_b32 s20, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
 ; GFX10-NEXT:    s_mov_b32 s21, s37
 ; GFX10-NEXT:    s_mov_b32 s22, s38
 ; GFX10-NEXT:    s_mov_b32 s23, s39
 ; GFX10-NEXT:    s_mov_b32 s24, s40
-; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX10-NEXT:    s_mov_b32 s25, s41
+; GFX10-NEXT:    s_mov_b32 s26, s42
+; GFX10-NEXT:    s_mov_b32 s27, s43
+; GFX10-NEXT:    s_mov_b32 s28, s44
+; GFX10-NEXT:    s_mov_b32 s29, s45
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s50
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s51
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
@@ -9282,19 +9288,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
 ; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
 ; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
-; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX10-NEXT:    s_mov_b32 s26, s42
-; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
-; GFX10-NEXT:    s_mov_b32 s27, s43
-; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
-; GFX10-NEXT:    s_mov_b32 s28, s44
-; GFX10-NEXT:    v_writelane_b32 v40, s29, 25
-; GFX10-NEXT:    s_mov_b32 s29, s45
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
@@ -9397,8 +9393,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
@@ -9462,39 +9458,43 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
 ; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX9-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s52, s[34:35], 0x0
-; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
-; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
-; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s24, 20
 ; GFX9-NEXT:    v_writelane_b32 v40, s25, 21
+; GFX9-NEXT:    v_writelane_b32 v40, s26, 22
+; GFX9-NEXT:    v_writelane_b32 v40, s27, 23
+; GFX9-NEXT:    v_writelane_b32 v40, s28, 24
+; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s52
-; GFX9-NEXT:    v_writelane_b32 v40, s27, 23
+; GFX9-NEXT:    s_load_dword s34, s[30:31], 0x0
+; GFX9-NEXT:    ; kill: killed $sgpr30_sgpr31
+; GFX9-NEXT:    ; kill: killed $sgpr30_sgpr31
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[30:31], 0x40
+; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s34
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s46
-; GFX9-NEXT:    v_writelane_b32 v40, s28, 24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s47
-; GFX9-NEXT:    v_mov_b32_e32 v2, s48
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s48
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s49
-; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s50
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s51
 ; GFX9-NEXT:    s_mov_b32 s20, s36
@@ -9507,14 +9507,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s27, s43
 ; GFX9-NEXT:    s_mov_b32 s28, s44
 ; GFX9-NEXT:    s_mov_b32 s29, s45
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v32i32_i32_inreg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v32i32_i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
@@ -9558,7 +9554,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 28
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
@@ -9577,56 +9572,57 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
+; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
+; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
+; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
+; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
+; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
+; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
+; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
+; GFX10-NEXT:    v_writelane_b32 v40, s29, 25
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dword s52, s[34:35], 0x0
+; GFX10-NEXT:    s_load_dword s34, s[30:31], 0x0
 ; GFX10-NEXT:    ; meta instruction
 ; GFX10-NEXT:    ; meta instruction
-; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_v32i32_i32_inreg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_v32i32_i32_inreg@rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
-; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
-; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
+; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[30:31], 0x40
+; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s52
+; GFX10-NEXT:    v_mov_b32_e32 v0, s34
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s47
-; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX10-NEXT:    v_mov_b32_e32 v2, s48
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s46
-; GFX10-NEXT:    v_mov_b32_e32 v2, s48
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s49
-; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
 ; GFX10-NEXT:    s_mov_b32 s20, s36
 ; GFX10-NEXT:    s_mov_b32 s21, s37
 ; GFX10-NEXT:    s_mov_b32 s22, s38
 ; GFX10-NEXT:    s_mov_b32 s23, s39
-; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX10-NEXT:    s_mov_b32 s24, s40
 ; GFX10-NEXT:    s_mov_b32 s25, s41
+; GFX10-NEXT:    s_mov_b32 s26, s42
+; GFX10-NEXT:    s_mov_b32 s27, s43
+; GFX10-NEXT:    s_mov_b32 s28, s44
+; GFX10-NEXT:    s_mov_b32 s29, s45
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s50
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s51
-; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX10-NEXT:    s_mov_b32 s26, s42
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
 ; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
 ; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
-; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
-; GFX10-NEXT:    s_mov_b32 s27, s43
-; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
-; GFX10-NEXT:    s_mov_b32 s28, s44
-; GFX10-NEXT:    v_writelane_b32 v40, s29, 25
-; GFX10-NEXT:    s_mov_b32 s29, s45
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
@@ -9734,8 +9730,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
@@ -9791,16 +9787,16 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, stack_passed_f64_arg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, stack_passed_f64_arg@rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -9824,17 +9820,17 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, stack_passed_f64_arg@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, stack_passed_f64_arg@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -9864,8 +9860,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -9931,12 +9927,12 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 11
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_12xv3i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_12xv3i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_12xv3i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_12xv3i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -9998,12 +9994,12 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 11
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_12xv3i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_12xv3i32@rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_12xv3i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_12xv3i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -10067,8 +10063,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -10154,12 +10150,12 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 7
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_8xv5i32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_8xv5i32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_8xv5i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_8xv5i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -10229,12 +10225,12 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 7
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_8xv5i32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_8xv5i32@rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_8xv5i32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_8xv5i32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -10303,8 +10299,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
@@ -10386,12 +10382,12 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 0x40e00000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_8xv5f32@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_8xv5f32@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_8xv5f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_8xv5f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -10461,12 +10457,12 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 0x40e00000
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_8xv5f32@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_8xv5f32@rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_8xv5f32@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_8xv5f32@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -10535,8 +10531,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 26bc62d0a99b1..fa8f51cf412fa 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -25,8 +25,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
@@ -59,8 +59,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -77,54 +77,26 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
   ret void
 }
 
-define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
-; GFX9-LABEL: void_func_void_clobber_s28_s29:
+define amdgpu_gfx void @void_func_void_clobber_s30_s31() #1 {
+; GFX9-LABEL: void_func_void_clobber_s30_s31:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    v_writelane_b32 v0, s28, 0
-; GFX9-NEXT:    v_writelane_b32 v0, s29, 1
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; clobber
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s29, v0, 1
-; GFX9-NEXT:    v_readlane_b32 s28, v0, 0
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
-; GFX10-LABEL: void_func_void_clobber_s28_s29:
+; GFX10-LABEL: void_func_void_clobber_s30_s31:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    v_writelane_b32 v0, s28, 0
-; GFX10-NEXT:    v_writelane_b32 v0, s29, 1
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; clobber
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s29, v0, 1
-; GFX10-NEXT:    v_readlane_b32 s28, v0, 0
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-; GCN:          v_writelane_b32 v0, s28, 0
-; GCN:          v_writelane_b32 v0, s29, 1
-
-; GCN:          v_readlane_b32 s28, v0, 0
-; GCN:          v_readlane_b32 s29, v0, 1
-  call void asm sideeffect "; clobber", "~{s[28:29]}"() #0
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
+  call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
   ret void
 }
 
@@ -137,24 +109,24 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s31
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_mov_b32 s4, s31
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    s_mov_b32 s31, s4
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s31
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -190,8 +162,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s31
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -224,17 +196,17 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
 ; GFX9-NEXT:    ; def v31
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v31
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v41
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v31
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -260,18 +232,18 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, v31
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_mov_b32_e32 v31, v41
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use v31
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -304,16 +276,16 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
 ; GFX9-NEXT:    ; def s33
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_mov_b32 s4, s33
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s33
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -334,9 +306,6 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s33
 ; GFX10-NEXT:    ;;#ASMEND
@@ -344,13 +313,16 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
 ; GFX10-NEXT:    s_mov_b32 s4, s33
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    s_mov_b32 s33, s4
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s33
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -378,21 +350,21 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s34
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_mov_b32 s4, s34
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s34, s4
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s34
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -413,9 +385,6 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[36:37]
-; GFX10-NEXT:    s_add_u32 s36, s36, external_void_func_void@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s37, s37, external_void_func_void@rel32@hi+12
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s34
 ; GFX10-NEXT:    ;;#ASMEND
@@ -423,13 +392,16 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
 ; GFX10-NEXT:    s_mov_b32 s4, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    s_mov_b32 s34, s4
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s34
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -461,16 +433,16 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def v40
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v40
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v41, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -495,17 +467,17 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
 ; GFX10-NEXT:    ; def v40
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_writelane_b32 v41, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v41, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use v40
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v41, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -616,12 +588,12 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, void_func_void_clobber_s33@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, void_func_void_clobber_s33@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, void_func_void_clobber_s33@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -641,14 +613,14 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, void_func_void_clobber_s33@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, void_func_void_clobber_s33@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, void_func_void_clobber_s33@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -673,12 +645,12 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, void_func_void_clobber_s34@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, void_func_void_clobber_s34@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, void_func_void_clobber_s34@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -698,14 +670,14 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, void_func_void_clobber_s34@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, void_func_void_clobber_s34@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, void_func_void_clobber_s34@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
@@ -735,15 +707,15 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-NEXT:    ; def s40
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_mov_b32 s4, s40
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s4
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -764,9 +736,6 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s40
 ; GFX10-NEXT:    ;;#ASMEND
@@ -774,12 +743,15 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX10-NEXT:    s_mov_b32 s4, s40
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s4
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
@@ -817,10 +789,10 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    ; def v32
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v32
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s4
 ; GFX9-NEXT:    ;;#ASMEND
@@ -828,8 +800,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    ; use v41
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
@@ -860,12 +832,12 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX10-NEXT:    ; def v32
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_mov_b32_e32 v41, v32
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s4
 ; GFX10-NEXT:    ;;#ASMEND
@@ -873,8 +845,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX10-NEXT:    ; use v41
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 1f22431168127..1b84b0a45458b 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -23,59 +23,27 @@ define amdgpu_gfx void @call_i1() #0 {
 ; GFX9-LABEL: call_i1:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    v_writelane_b32 v1, s33, 2
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v1, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v1, s31, 1
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, return_i1@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, return_i1@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v1, 0
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v1, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: call_i1:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    v_writelane_b32 v1, s33, 2
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT:    v_writelane_b32 v1, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v1, s31, 1
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, return_i1@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, return_i1@gotpcrel32@hi+12
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v1, 0
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v1, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
 entry:
   call amdgpu_gfx i1 @return_i1()
   ret void
@@ -102,59 +70,27 @@ define amdgpu_gfx void @call_i16() #0 {
 ; GFX9-LABEL: call_i16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    v_writelane_b32 v1, s33, 2
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v1, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v1, s31, 1
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, return_i16@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, return_i16@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v1, 0
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v1, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: call_i16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    v_writelane_b32 v1, s33, 2
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT:    v_writelane_b32 v1, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v1, s31, 1
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, return_i16@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, return_i16@gotpcrel32@hi+12
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v1, 0
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v1, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
 entry:
   call amdgpu_gfx i16 @return_i16()
   ret void
@@ -181,59 +117,27 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX9-LABEL: call_2xi16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    v_writelane_b32 v1, s33, 2
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v1, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v1, s31, 1
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, return_2xi16@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, return_2xi16@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v1, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v1, 0
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v1, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: call_2xi16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    v_writelane_b32 v1, s33, 2
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT:    v_writelane_b32 v1, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v1, s31, 1
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, return_2xi16@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, return_2xi16@gotpcrel32@hi+12
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v1, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v1, 0
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v1, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
 entry:
   call amdgpu_gfx <2 x i16> @return_2xi16()
   ret void
@@ -262,59 +166,27 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX9-LABEL: call_3xi16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    v_writelane_b32 v2, s33, 2
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, return_3xi16@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, return_3xi16@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v2, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: call_3xi16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    v_writelane_b32 v2, s33, 2
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, return_3xi16@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, return_3xi16@gotpcrel32@hi+12
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v2, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
 entry:
   call amdgpu_gfx <3 x i16> @return_3xi16()
   ret void
@@ -1369,63 +1241,41 @@ define amdgpu_gfx void @call_512xi32() #0 {
 ; GFX9-LABEL: call_512xi32:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    v_writelane_b32 v2, s33, 2
+; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_add_i32 s33, s32, 0x1ffc0
 ; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffe0000
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0x60000
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, return_512xi32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, return_512xi32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
-; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0xfffa0000
-; GFX9-NEXT:    v_readlane_b32 s33, v2, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: call_512xi32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    v_writelane_b32 v2, s33, 2
+; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_add_i32 s33, s32, 0xffe0
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0x30000
 ; GFX10-NEXT:    s_and_b32 s33, s33, 0xffff0000
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, return_512xi32@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, return_512xi32@gotpcrel32@hi+12
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0xfffd0000
-; GFX10-NEXT:    v_readlane_b32 s33, v2, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:2048 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
 entry:
   call amdgpu_gfx <512 x i32> @return_512xi32()
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index a1a6a805f3678..a47606e147454 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -399,23 +399,23 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 17
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s46, 13
-; GCN-NEXT:    v_writelane_b32 v40, s47, 14
-; GCN-NEXT:    v_writelane_b32 v40, s48, 15
-; GCN-NEXT:    v_writelane_b32 v40, s49, 16
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s46, 11
+; GCN-NEXT:    v_writelane_b32 v40, s47, 12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 13
+; GCN-NEXT:    v_writelane_b32 v40, s49, 14
+; GCN-NEXT:    v_writelane_b32 v40, s30, 15
+; GCN-NEXT:    v_writelane_b32 v40, s31, 16
 ; GCN-NEXT:    s_mov_b32 s42, s14
 ; GCN-NEXT:    s_mov_b32 s43, s13
 ; GCN-NEXT:    s_mov_b32 s44, s12
@@ -443,30 +443,30 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB2_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[46:47]
-; GCN-NEXT:    v_readlane_b32 s49, v40, 16
-; GCN-NEXT:    v_readlane_b32 s48, v40, 15
-; GCN-NEXT:    v_readlane_b32 s47, v40, 14
-; GCN-NEXT:    v_readlane_b32 s46, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s4, v40, 15
+; GCN-NEXT:    v_readlane_b32 s5, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 14
+; GCN-NEXT:    v_readlane_b32 s48, v40, 13
+; GCN-NEXT:    v_readlane_b32 s47, v40, 12
+; GCN-NEXT:    v_readlane_b32 s46, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    v_readlane_b32 s33, v40, 17
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr:
 ; GISEL:       ; %bb.0:
@@ -477,23 +477,21 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s33, 17
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 14
 ; GISEL-NEXT:    s_mov_b32 s42, s14
 ; GISEL-NEXT:    s_mov_b32 s43, s13
 ; GISEL-NEXT:    s_mov_b32 s44, s12
@@ -501,6 +499,8 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
 ; GISEL-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GISEL-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; GISEL-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 16
 ; GISEL-NEXT:    s_mov_b64 s[46:47], exec
 ; GISEL-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s16, v0
@@ -521,30 +521,30 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB2_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[46:47]
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s4, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s5, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GISEL-NEXT:    v_readlane_b32 s33, v40, 17
-; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-NEXT:    s_setpc_b64 s[4:5]
   call void %fptr()
   ret void
 }
@@ -559,23 +559,23 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 17
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s46, 13
-; GCN-NEXT:    v_writelane_b32 v40, s47, 14
-; GCN-NEXT:    v_writelane_b32 v40, s48, 15
-; GCN-NEXT:    v_writelane_b32 v40, s49, 16
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s46, 11
+; GCN-NEXT:    v_writelane_b32 v40, s47, 12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 13
+; GCN-NEXT:    v_writelane_b32 v40, s49, 14
+; GCN-NEXT:    v_writelane_b32 v40, s30, 15
+; GCN-NEXT:    v_writelane_b32 v40, s31, 16
 ; GCN-NEXT:    s_mov_b32 s42, s14
 ; GCN-NEXT:    s_mov_b32 s43, s13
 ; GCN-NEXT:    s_mov_b32 s44, s12
@@ -606,30 +606,30 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB3_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[46:47]
-; GCN-NEXT:    v_readlane_b32 s49, v40, 16
-; GCN-NEXT:    v_readlane_b32 s48, v40, 15
-; GCN-NEXT:    v_readlane_b32 s47, v40, 14
-; GCN-NEXT:    v_readlane_b32 s46, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s4, v40, 15
+; GCN-NEXT:    v_readlane_b32 s5, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 14
+; GCN-NEXT:    v_readlane_b32 s48, v40, 13
+; GCN-NEXT:    v_readlane_b32 s47, v40, 12
+; GCN-NEXT:    v_readlane_b32 s46, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    v_readlane_b32 s33, v40, 17
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg:
 ; GISEL:       ; %bb.0:
@@ -640,23 +640,21 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s33, 17
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 14
 ; GISEL-NEXT:    s_mov_b32 s42, s14
 ; GISEL-NEXT:    s_mov_b32 s43, s13
 ; GISEL-NEXT:    s_mov_b32 s44, s12
@@ -664,6 +662,8 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GISEL-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GISEL-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; GISEL-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 16
 ; GISEL-NEXT:    s_mov_b64 s[46:47], exec
 ; GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s16, v0
@@ -685,30 +685,30 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB3_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[46:47]
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s4, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s5, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GISEL-NEXT:    v_readlane_b32 s33, v40, 17
-; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-NEXT:    s_setpc_b64 s[4:5]
   call void %fptr(i32 123)
   ret void
 }
@@ -723,23 +723,23 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 17
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s46, 13
-; GCN-NEXT:    v_writelane_b32 v40, s47, 14
-; GCN-NEXT:    v_writelane_b32 v40, s48, 15
-; GCN-NEXT:    v_writelane_b32 v40, s49, 16
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s46, 11
+; GCN-NEXT:    v_writelane_b32 v40, s47, 12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 13
+; GCN-NEXT:    v_writelane_b32 v40, s49, 14
+; GCN-NEXT:    v_writelane_b32 v40, s30, 15
+; GCN-NEXT:    v_writelane_b32 v40, s31, 16
 ; GCN-NEXT:    s_mov_b32 s42, s14
 ; GCN-NEXT:    s_mov_b32 s43, s13
 ; GCN-NEXT:    s_mov_b32 s44, s12
@@ -769,30 +769,30 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[46:47]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v2
-; GCN-NEXT:    v_readlane_b32 s49, v40, 16
-; GCN-NEXT:    v_readlane_b32 s48, v40, 15
-; GCN-NEXT:    v_readlane_b32 s47, v40, 14
-; GCN-NEXT:    v_readlane_b32 s46, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s4, v40, 15
+; GCN-NEXT:    v_readlane_b32 s5, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 14
+; GCN-NEXT:    v_readlane_b32 s48, v40, 13
+; GCN-NEXT:    v_readlane_b32 s47, v40, 12
+; GCN-NEXT:    v_readlane_b32 s46, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    v_readlane_b32 s33, v40, 17
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_ret:
 ; GISEL:       ; %bb.0:
@@ -803,23 +803,21 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s33, 17
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 14
 ; GISEL-NEXT:    s_mov_b32 s42, s14
 ; GISEL-NEXT:    s_mov_b32 s43, s13
 ; GISEL-NEXT:    s_mov_b32 s44, s12
@@ -827,6 +825,8 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
 ; GISEL-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GISEL-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; GISEL-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 16
 ; GISEL-NEXT:    s_mov_b64 s[46:47], exec
 ; GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s16, v0
@@ -849,30 +849,30 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[46:47]
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v2
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s4, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s5, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GISEL-NEXT:    v_readlane_b32 s33, v40, 17
-; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-NEXT:    s_setpc_b64 s[4:5]
   %a = call i32 %fptr()
   %b = add i32 %a, 1
   ret i32 %b
@@ -888,25 +888,23 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 19
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s46, 13
-; GCN-NEXT:    v_writelane_b32 v40, s47, 14
-; GCN-NEXT:    v_writelane_b32 v40, s48, 15
-; GCN-NEXT:    v_writelane_b32 v40, s49, 16
-; GCN-NEXT:    v_writelane_b32 v40, s50, 17
-; GCN-NEXT:    v_writelane_b32 v40, s51, 18
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s46, 11
+; GCN-NEXT:    v_writelane_b32 v40, s47, 12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 13
+; GCN-NEXT:    v_writelane_b32 v40, s49, 14
+; GCN-NEXT:    v_writelane_b32 v40, s50, 15
+; GCN-NEXT:    v_writelane_b32 v40, s51, 16
 ; GCN-NEXT:    s_mov_b32 s42, s14
 ; GCN-NEXT:    s_mov_b32 s43, s13
 ; GCN-NEXT:    s_mov_b32 s44, s12
@@ -919,6 +917,8 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GCN-NEXT:    s_and_saveexec_b64 s[46:47], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB5_4
 ; GCN-NEXT:  ; %bb.1: ; %bb1
+; GCN-NEXT:    v_writelane_b32 v40, s30, 17
+; GCN-NEXT:    v_writelane_b32 v40, s31, 18
 ; GCN-NEXT:    s_mov_b64 s[48:49], exec
 ; GCN-NEXT:  .LBB5_2: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v0
@@ -939,27 +939,27 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GCN-NEXT:    s_cbranch_execnz .LBB5_2
 ; GCN-NEXT:  ; %bb.3:
 ; GCN-NEXT:    s_mov_b64 exec, s[48:49]
+; GCN-NEXT:    v_readlane_b32 s30, v40, 17
+; GCN-NEXT:    v_readlane_b32 s31, v40, 18
 ; GCN-NEXT:  .LBB5_4: ; %bb2
 ; GCN-NEXT:    s_or_b64 exec, exec, s[46:47]
-; GCN-NEXT:    v_readlane_b32 s51, v40, 18
-; GCN-NEXT:    v_readlane_b32 s50, v40, 17
-; GCN-NEXT:    v_readlane_b32 s49, v40, 16
-; GCN-NEXT:    v_readlane_b32 s48, v40, 15
-; GCN-NEXT:    v_readlane_b32 s47, v40, 14
-; GCN-NEXT:    v_readlane_b32 s46, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s51, v40, 16
+; GCN-NEXT:    v_readlane_b32 s50, v40, 15
+; GCN-NEXT:    v_readlane_b32 s49, v40, 14
+; GCN-NEXT:    v_readlane_b32 s48, v40, 13
+; GCN-NEXT:    v_readlane_b32 s47, v40, 12
+; GCN-NEXT:    v_readlane_b32 s46, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    v_readlane_b32 s33, v40, 19
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
@@ -977,25 +977,23 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s33, 19
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 16
 ; GISEL-NEXT:    s_mov_b32 s42, s14
 ; GISEL-NEXT:    s_mov_b32 s43, s13
 ; GISEL-NEXT:    s_mov_b32 s44, s12
@@ -1008,6 +1006,8 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_and_saveexec_b64 s[46:47], vcc
 ; GISEL-NEXT:    s_cbranch_execz .LBB5_4
 ; GISEL-NEXT:  ; %bb.1: ; %bb1
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 18
 ; GISEL-NEXT:    s_mov_b64 s[48:49], exec
 ; GISEL-NEXT:  .LBB5_2: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s16, v0
@@ -1028,27 +1028,27 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB5_2
 ; GISEL-NEXT:  ; %bb.3:
 ; GISEL-NEXT:    s_mov_b64 exec, s[48:49]
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 18
 ; GISEL-NEXT:  .LBB5_4: ; %bb2
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[46:47]
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GISEL-NEXT:    v_readlane_b32 s33, v40, 19
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
@@ -1074,93 +1074,90 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_writelane_b32 v40, s33, 32
+; GCN-NEXT:    v_writelane_b32 v40, s33, 30
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    v_writelane_b32 v40, s50, 18
-; GCN-NEXT:    v_writelane_b32 v40, s51, 19
-; GCN-NEXT:    v_writelane_b32 v40, s52, 20
-; GCN-NEXT:    v_writelane_b32 v40, s53, 21
-; GCN-NEXT:    v_writelane_b32 v40, s54, 22
-; GCN-NEXT:    v_writelane_b32 v40, s55, 23
-; GCN-NEXT:    v_writelane_b32 v40, s56, 24
-; GCN-NEXT:    v_writelane_b32 v40, s57, 25
-; GCN-NEXT:    v_writelane_b32 v40, s58, 26
-; GCN-NEXT:    v_writelane_b32 v40, s59, 27
-; GCN-NEXT:    v_writelane_b32 v40, s60, 28
-; GCN-NEXT:    v_writelane_b32 v40, s61, 29
-; GCN-NEXT:    v_writelane_b32 v40, s62, 30
-; GCN-NEXT:    v_writelane_b32 v40, s63, 31
-; GCN-NEXT:    s_mov_b64 s[6:7], exec
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s45, 11
+; GCN-NEXT:    v_writelane_b32 v40, s46, 12
+; GCN-NEXT:    v_writelane_b32 v40, s47, 13
+; GCN-NEXT:    v_writelane_b32 v40, s48, 14
+; GCN-NEXT:    v_writelane_b32 v40, s49, 15
+; GCN-NEXT:    v_writelane_b32 v40, s50, 16
+; GCN-NEXT:    v_writelane_b32 v40, s51, 17
+; GCN-NEXT:    v_writelane_b32 v40, s52, 18
+; GCN-NEXT:    v_writelane_b32 v40, s53, 19
+; GCN-NEXT:    v_writelane_b32 v40, s54, 20
+; GCN-NEXT:    v_writelane_b32 v40, s55, 21
+; GCN-NEXT:    v_writelane_b32 v40, s56, 22
+; GCN-NEXT:    v_writelane_b32 v40, s57, 23
+; GCN-NEXT:    v_writelane_b32 v40, s58, 24
+; GCN-NEXT:    v_writelane_b32 v40, s59, 25
+; GCN-NEXT:    v_writelane_b32 v40, s60, 26
+; GCN-NEXT:    v_writelane_b32 v40, s61, 27
+; GCN-NEXT:    v_writelane_b32 v40, s62, 28
+; GCN-NEXT:    v_writelane_b32 v40, s63, 29
+; GCN-NEXT:    s_mov_b64 s[6:7], s[30:31]
+; GCN-NEXT:    s_mov_b64 s[8:9], exec
 ; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NEXT:    v_readfirstlane_b32 s11, v1
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
-; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[12:13], v[0:1]
+; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN-NEXT:    s_xor_b64 exec, exec, s[8:9]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
 ; GCN-NEXT:    s_cbranch_execnz .LBB6_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    v_readlane_b32 s63, v40, 31
-; GCN-NEXT:    v_readlane_b32 s62, v40, 30
-; GCN-NEXT:    v_readlane_b32 s61, v40, 29
-; GCN-NEXT:    v_readlane_b32 s60, v40, 28
-; GCN-NEXT:    v_readlane_b32 s59, v40, 27
-; GCN-NEXT:    v_readlane_b32 s58, v40, 26
-; GCN-NEXT:    v_readlane_b32 s57, v40, 25
-; GCN-NEXT:    v_readlane_b32 s56, v40, 24
-; GCN-NEXT:    v_readlane_b32 s55, v40, 23
-; GCN-NEXT:    v_readlane_b32 s54, v40, 22
-; GCN-NEXT:    v_readlane_b32 s53, v40, 21
-; GCN-NEXT:    v_readlane_b32 s52, v40, 20
-; GCN-NEXT:    v_readlane_b32 s51, v40, 19
-; GCN-NEXT:    v_readlane_b32 s50, v40, 18
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_readlane_b32 s63, v40, 29
+; GCN-NEXT:    v_readlane_b32 s62, v40, 28
+; GCN-NEXT:    v_readlane_b32 s61, v40, 27
+; GCN-NEXT:    v_readlane_b32 s60, v40, 26
+; GCN-NEXT:    v_readlane_b32 s59, v40, 25
+; GCN-NEXT:    v_readlane_b32 s58, v40, 24
+; GCN-NEXT:    v_readlane_b32 s57, v40, 23
+; GCN-NEXT:    v_readlane_b32 s56, v40, 22
+; GCN-NEXT:    v_readlane_b32 s55, v40, 21
+; GCN-NEXT:    v_readlane_b32 s54, v40, 20
+; GCN-NEXT:    v_readlane_b32 s53, v40, 19
+; GCN-NEXT:    v_readlane_b32 s52, v40, 18
+; GCN-NEXT:    v_readlane_b32 s51, v40, 17
+; GCN-NEXT:    v_readlane_b32 s50, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 15
+; GCN-NEXT:    v_readlane_b32 s48, v40, 14
+; GCN-NEXT:    v_readlane_b32 s47, v40, 13
+; GCN-NEXT:    v_readlane_b32 s46, v40, 12
+; GCN-NEXT:    v_readlane_b32 s45, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    v_readlane_b32 s33, v40, 32
+; GCN-NEXT:    v_readlane_b32 s33, v40, 30
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[6:7]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
 ; GISEL:       ; %bb.0:
@@ -1168,93 +1165,90 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_writelane_b32 v40, s33, 32
+; GISEL-NEXT:    v_writelane_b32 v40, s33, 30
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 19
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 20
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 21
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 22
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 23
-; GISEL-NEXT:    v_writelane_b32 v40, s56, 24
-; GISEL-NEXT:    v_writelane_b32 v40, s57, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s58, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s59, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s60, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s61, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s45, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 20
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 21
+; GISEL-NEXT:    v_writelane_b32 v40, s56, 22
+; GISEL-NEXT:    v_writelane_b32 v40, s57, 23
+; GISEL-NEXT:    v_writelane_b32 v40, s58, 24
+; GISEL-NEXT:    v_writelane_b32 v40, s59, 25
+; GISEL-NEXT:    v_writelane_b32 v40, s60, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s61, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s62, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s63, 29
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[30:31]
 ; GISEL-NEXT:    s_movk_i32 s4, 0x7b
-; GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
-; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GISEL-NEXT:    v_readfirstlane_b32 s10, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s11, v1
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[12:13]
 ; GISEL-NEXT:    s_cbranch_execnz .LBB6_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
-; GISEL-NEXT:    v_readlane_b32 s63, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s62, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s61, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s60, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s59, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s58, v40, 26
-; GISEL-NEXT:    v_readlane_b32 s57, v40, 25
-; GISEL-NEXT:    v_readlane_b32 s56, v40, 24
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 23
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 22
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 21
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 20
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    s_mov_b64 exec, s[8:9]
+; GISEL-NEXT:    v_readlane_b32 s63, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s62, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s61, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s60, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s59, v40, 25
+; GISEL-NEXT:    v_readlane_b32 s58, v40, 24
+; GISEL-NEXT:    v_readlane_b32 s57, v40, 23
+; GISEL-NEXT:    v_readlane_b32 s56, v40, 22
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 21
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 20
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s45, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    v_readlane_b32 s33, v40, 32
+; GISEL-NEXT:    v_readlane_b32 s33, v40, 30
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-NEXT:    s_setpc_b64 s[6:7]
   call amdgpu_gfx void %fptr(i32 inreg 123)
   ret void
 }
@@ -1266,97 +1260,94 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_writelane_b32 v40, s33, 32
+; GCN-NEXT:    v_writelane_b32 v40, s33, 30
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    v_writelane_b32 v40, s50, 18
-; GCN-NEXT:    v_writelane_b32 v40, s51, 19
-; GCN-NEXT:    v_writelane_b32 v40, s52, 20
-; GCN-NEXT:    v_writelane_b32 v40, s53, 21
-; GCN-NEXT:    v_writelane_b32 v40, s54, 22
-; GCN-NEXT:    v_writelane_b32 v40, s55, 23
-; GCN-NEXT:    v_writelane_b32 v40, s56, 24
-; GCN-NEXT:    v_writelane_b32 v40, s57, 25
-; GCN-NEXT:    v_writelane_b32 v40, s58, 26
-; GCN-NEXT:    v_writelane_b32 v40, s59, 27
-; GCN-NEXT:    v_writelane_b32 v40, s60, 28
-; GCN-NEXT:    v_writelane_b32 v40, s61, 29
-; GCN-NEXT:    v_writelane_b32 v40, s62, 30
-; GCN-NEXT:    v_writelane_b32 v40, s63, 31
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s45, 11
+; GCN-NEXT:    v_writelane_b32 v40, s46, 12
+; GCN-NEXT:    v_writelane_b32 v40, s47, 13
+; GCN-NEXT:    v_writelane_b32 v40, s48, 14
+; GCN-NEXT:    v_writelane_b32 v40, s49, 15
+; GCN-NEXT:    v_writelane_b32 v40, s50, 16
+; GCN-NEXT:    v_writelane_b32 v40, s51, 17
+; GCN-NEXT:    v_writelane_b32 v40, s52, 18
+; GCN-NEXT:    v_writelane_b32 v40, s53, 19
+; GCN-NEXT:    v_writelane_b32 v40, s54, 20
+; GCN-NEXT:    v_writelane_b32 v40, s55, 21
+; GCN-NEXT:    v_writelane_b32 v40, s56, 22
+; GCN-NEXT:    v_writelane_b32 v40, s57, 23
+; GCN-NEXT:    v_writelane_b32 v40, s58, 24
+; GCN-NEXT:    v_writelane_b32 v40, s59, 25
+; GCN-NEXT:    v_writelane_b32 v40, s60, 26
+; GCN-NEXT:    v_writelane_b32 v40, s61, 27
+; GCN-NEXT:    v_writelane_b32 v40, s62, 28
+; GCN-NEXT:    v_writelane_b32 v40, s63, 29
+; GCN-NEXT:    s_mov_b64 s[4:5], s[30:31]
 ; GCN-NEXT:    v_mov_b32_e32 v41, v0
-; GCN-NEXT:    s_mov_b64 s[4:5], exec
+; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v1
-; GCN-NEXT:    v_readfirstlane_b32 s9, v2
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    v_readfirstlane_b32 s11, v2
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[1:2]
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GCN-NEXT:    v_mov_b32_e32 v0, v41
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GCN-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execnz .LBB7_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v41
-; GCN-NEXT:    v_readlane_b32 s63, v40, 31
-; GCN-NEXT:    v_readlane_b32 s62, v40, 30
-; GCN-NEXT:    v_readlane_b32 s61, v40, 29
-; GCN-NEXT:    v_readlane_b32 s60, v40, 28
-; GCN-NEXT:    v_readlane_b32 s59, v40, 27
-; GCN-NEXT:    v_readlane_b32 s58, v40, 26
-; GCN-NEXT:    v_readlane_b32 s57, v40, 25
-; GCN-NEXT:    v_readlane_b32 s56, v40, 24
-; GCN-NEXT:    v_readlane_b32 s55, v40, 23
-; GCN-NEXT:    v_readlane_b32 s54, v40, 22
-; GCN-NEXT:    v_readlane_b32 s53, v40, 21
-; GCN-NEXT:    v_readlane_b32 s52, v40, 20
-; GCN-NEXT:    v_readlane_b32 s51, v40, 19
-; GCN-NEXT:    v_readlane_b32 s50, v40, 18
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s63, v40, 29
+; GCN-NEXT:    v_readlane_b32 s62, v40, 28
+; GCN-NEXT:    v_readlane_b32 s61, v40, 27
+; GCN-NEXT:    v_readlane_b32 s60, v40, 26
+; GCN-NEXT:    v_readlane_b32 s59, v40, 25
+; GCN-NEXT:    v_readlane_b32 s58, v40, 24
+; GCN-NEXT:    v_readlane_b32 s57, v40, 23
+; GCN-NEXT:    v_readlane_b32 s56, v40, 22
+; GCN-NEXT:    v_readlane_b32 s55, v40, 21
+; GCN-NEXT:    v_readlane_b32 s54, v40, 20
+; GCN-NEXT:    v_readlane_b32 s53, v40, 19
+; GCN-NEXT:    v_readlane_b32 s52, v40, 18
+; GCN-NEXT:    v_readlane_b32 s51, v40, 17
+; GCN-NEXT:    v_readlane_b32 s50, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 15
+; GCN-NEXT:    v_readlane_b32 s48, v40, 14
+; GCN-NEXT:    v_readlane_b32 s47, v40, 13
+; GCN-NEXT:    v_readlane_b32 s46, v40, 12
+; GCN-NEXT:    v_readlane_b32 s45, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    v_readlane_b32 s33, v40, 32
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_readlane_b32 s33, v40, 30
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
 ; GISEL:       ; %bb.0:
@@ -1364,97 +1355,94 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_writelane_b32 v40, s33, 32
+; GISEL-NEXT:    v_writelane_b32 v40, s33, 30
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 19
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 20
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 21
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 22
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 23
-; GISEL-NEXT:    v_writelane_b32 v40, s56, 24
-; GISEL-NEXT:    v_writelane_b32 v40, s57, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s58, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s59, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s60, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s61, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s45, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 20
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 21
+; GISEL-NEXT:    v_writelane_b32 v40, s56, 22
+; GISEL-NEXT:    v_writelane_b32 v40, s57, 23
+; GISEL-NEXT:    v_writelane_b32 v40, s58, 24
+; GISEL-NEXT:    v_writelane_b32 v40, s59, 25
+; GISEL-NEXT:    v_writelane_b32 v40, s60, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s61, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s62, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s63, 29
 ; GISEL-NEXT:    v_mov_b32_e32 v41, v0
-; GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT:    v_readfirstlane_b32 s6, v1
-; GISEL-NEXT:    v_readfirstlane_b32 s7, v2
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2]
-; GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s8, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s9, v2
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v41
-; GISEL-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
 ; GISEL-NEXT:    s_cbranch_execnz .LBB7_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v41
-; GISEL-NEXT:    v_readlane_b32 s63, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s62, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s61, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s60, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s59, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s58, v40, 26
-; GISEL-NEXT:    v_readlane_b32 s57, v40, 25
-; GISEL-NEXT:    v_readlane_b32 s56, v40, 24
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 23
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 22
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 21
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 20
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s63, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s62, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s61, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s60, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s59, v40, 25
+; GISEL-NEXT:    v_readlane_b32 s58, v40, 24
+; GISEL-NEXT:    v_readlane_b32 s57, v40, 23
+; GISEL-NEXT:    v_readlane_b32 s56, v40, 22
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 21
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 20
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s45, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    v_readlane_b32 s33, v40, 32
-; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GISEL-NEXT:    v_readlane_b32 s33, v40, 30
+; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-NEXT:    s_setpc_b64 s[4:5]
   call amdgpu_gfx void %fptr(i32 %i)
   ret i32 %i
 }
@@ -1470,95 +1458,92 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_writelane_b32 v40, s33, 32
+; GCN-NEXT:    v_writelane_b32 v40, s33, 30
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    v_writelane_b32 v40, s50, 18
-; GCN-NEXT:    v_writelane_b32 v40, s51, 19
-; GCN-NEXT:    v_writelane_b32 v40, s52, 20
-; GCN-NEXT:    v_writelane_b32 v40, s53, 21
-; GCN-NEXT:    v_writelane_b32 v40, s54, 22
-; GCN-NEXT:    v_writelane_b32 v40, s55, 23
-; GCN-NEXT:    v_writelane_b32 v40, s56, 24
-; GCN-NEXT:    v_writelane_b32 v40, s57, 25
-; GCN-NEXT:    v_writelane_b32 v40, s58, 26
-; GCN-NEXT:    v_writelane_b32 v40, s59, 27
-; GCN-NEXT:    v_writelane_b32 v40, s60, 28
-; GCN-NEXT:    v_writelane_b32 v40, s61, 29
-; GCN-NEXT:    v_writelane_b32 v40, s62, 30
-; GCN-NEXT:    v_writelane_b32 v40, s63, 31
-; GCN-NEXT:    s_mov_b64 s[4:5], exec
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s45, 11
+; GCN-NEXT:    v_writelane_b32 v40, s46, 12
+; GCN-NEXT:    v_writelane_b32 v40, s47, 13
+; GCN-NEXT:    v_writelane_b32 v40, s48, 14
+; GCN-NEXT:    v_writelane_b32 v40, s49, 15
+; GCN-NEXT:    v_writelane_b32 v40, s50, 16
+; GCN-NEXT:    v_writelane_b32 v40, s51, 17
+; GCN-NEXT:    v_writelane_b32 v40, s52, 18
+; GCN-NEXT:    v_writelane_b32 v40, s53, 19
+; GCN-NEXT:    v_writelane_b32 v40, s54, 20
+; GCN-NEXT:    v_writelane_b32 v40, s55, 21
+; GCN-NEXT:    v_writelane_b32 v40, s56, 22
+; GCN-NEXT:    v_writelane_b32 v40, s57, 23
+; GCN-NEXT:    v_writelane_b32 v40, s58, 24
+; GCN-NEXT:    v_writelane_b32 v40, s59, 25
+; GCN-NEXT:    v_writelane_b32 v40, s60, 26
+; GCN-NEXT:    v_writelane_b32 v40, s61, 27
+; GCN-NEXT:    v_writelane_b32 v40, s62, 28
+; GCN-NEXT:    v_writelane_b32 v40, s63, 29
+; GCN-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v1
-; GCN-NEXT:    v_readfirstlane_b32 s9, v2
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    v_readfirstlane_b32 s11, v2
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[1:2]
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GCN-NEXT:    v_mov_b32_e32 v3, v0
 ; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execnz .LBB8_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
-; GCN-NEXT:    v_readlane_b32 s63, v40, 31
-; GCN-NEXT:    v_readlane_b32 s62, v40, 30
-; GCN-NEXT:    v_readlane_b32 s61, v40, 29
-; GCN-NEXT:    v_readlane_b32 s60, v40, 28
-; GCN-NEXT:    v_readlane_b32 s59, v40, 27
-; GCN-NEXT:    v_readlane_b32 s58, v40, 26
-; GCN-NEXT:    v_readlane_b32 s57, v40, 25
-; GCN-NEXT:    v_readlane_b32 s56, v40, 24
-; GCN-NEXT:    v_readlane_b32 s55, v40, 23
-; GCN-NEXT:    v_readlane_b32 s54, v40, 22
-; GCN-NEXT:    v_readlane_b32 s53, v40, 21
-; GCN-NEXT:    v_readlane_b32 s52, v40, 20
-; GCN-NEXT:    v_readlane_b32 s51, v40, 19
-; GCN-NEXT:    v_readlane_b32 s50, v40, 18
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s63, v40, 29
+; GCN-NEXT:    v_readlane_b32 s62, v40, 28
+; GCN-NEXT:    v_readlane_b32 s61, v40, 27
+; GCN-NEXT:    v_readlane_b32 s60, v40, 26
+; GCN-NEXT:    v_readlane_b32 s59, v40, 25
+; GCN-NEXT:    v_readlane_b32 s58, v40, 24
+; GCN-NEXT:    v_readlane_b32 s57, v40, 23
+; GCN-NEXT:    v_readlane_b32 s56, v40, 22
+; GCN-NEXT:    v_readlane_b32 s55, v40, 21
+; GCN-NEXT:    v_readlane_b32 s54, v40, 20
+; GCN-NEXT:    v_readlane_b32 s53, v40, 19
+; GCN-NEXT:    v_readlane_b32 s52, v40, 18
+; GCN-NEXT:    v_readlane_b32 s51, v40, 17
+; GCN-NEXT:    v_readlane_b32 s50, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 15
+; GCN-NEXT:    v_readlane_b32 s48, v40, 14
+; GCN-NEXT:    v_readlane_b32 s47, v40, 13
+; GCN-NEXT:    v_readlane_b32 s46, v40, 12
+; GCN-NEXT:    v_readlane_b32 s45, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    v_readlane_b32 s33, v40, 32
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_readlane_b32 s33, v40, 30
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
 ; GISEL:       ; %bb.0:
@@ -1566,95 +1551,92 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_writelane_b32 v40, s33, 32
+; GISEL-NEXT:    v_writelane_b32 v40, s33, 30
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 19
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 20
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 21
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 22
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 23
-; GISEL-NEXT:    v_writelane_b32 v40, s56, 24
-; GISEL-NEXT:    v_writelane_b32 v40, s57, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s58, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s59, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s60, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s61, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
-; GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s45, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 20
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 21
+; GISEL-NEXT:    v_writelane_b32 v40, s56, 22
+; GISEL-NEXT:    v_writelane_b32 v40, s57, 23
+; GISEL-NEXT:    v_writelane_b32 v40, s58, 24
+; GISEL-NEXT:    v_writelane_b32 v40, s59, 25
+; GISEL-NEXT:    v_writelane_b32 v40, s60, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s61, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s62, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s63, 29
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT:    v_readfirstlane_b32 s6, v1
-; GISEL-NEXT:    v_readfirstlane_b32 s7, v2
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2]
-; GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GISEL-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GISEL-NEXT:    v_readfirstlane_b32 s8, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s9, v2
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    v_mov_b32_e32 v3, v0
 ; GISEL-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
 ; GISEL-NEXT:    s_cbranch_execnz .LBB8_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_readlane_b32 s63, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s62, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s61, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s60, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s59, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s58, v40, 26
-; GISEL-NEXT:    v_readlane_b32 s57, v40, 25
-; GISEL-NEXT:    v_readlane_b32 s56, v40, 24
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 23
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 22
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 21
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 20
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s63, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s62, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s61, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s60, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s59, v40, 25
+; GISEL-NEXT:    v_readlane_b32 s58, v40, 24
+; GISEL-NEXT:    v_readlane_b32 s57, v40, 23
+; GISEL-NEXT:    v_readlane_b32 s56, v40, 22
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 21
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 20
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s45, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    v_readlane_b32 s33, v40, 32
-; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GISEL-NEXT:    v_readlane_b32 s33, v40, 30
+; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-NEXT:    s_setpc_b64 s[4:5]
   %ret = call amdgpu_gfx i32 %fptr(i32 %i)
   ret i32 %ret
 }
@@ -1667,92 +1649,89 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_writelane_b32 v40, s33, 32
+; GCN-NEXT:    v_writelane_b32 v40, s33, 30
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    v_writelane_b32 v40, s50, 18
-; GCN-NEXT:    v_writelane_b32 v40, s51, 19
-; GCN-NEXT:    v_writelane_b32 v40, s52, 20
-; GCN-NEXT:    v_writelane_b32 v40, s53, 21
-; GCN-NEXT:    v_writelane_b32 v40, s54, 22
-; GCN-NEXT:    v_writelane_b32 v40, s55, 23
-; GCN-NEXT:    v_writelane_b32 v40, s56, 24
-; GCN-NEXT:    v_writelane_b32 v40, s57, 25
-; GCN-NEXT:    v_writelane_b32 v40, s58, 26
-; GCN-NEXT:    v_writelane_b32 v40, s59, 27
-; GCN-NEXT:    v_writelane_b32 v40, s60, 28
-; GCN-NEXT:    v_writelane_b32 v40, s61, 29
-; GCN-NEXT:    v_writelane_b32 v40, s62, 30
-; GCN-NEXT:    v_writelane_b32 v40, s63, 31
-; GCN-NEXT:    s_mov_b64 s[4:5], exec
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s45, 11
+; GCN-NEXT:    v_writelane_b32 v40, s46, 12
+; GCN-NEXT:    v_writelane_b32 v40, s47, 13
+; GCN-NEXT:    v_writelane_b32 v40, s48, 14
+; GCN-NEXT:    v_writelane_b32 v40, s49, 15
+; GCN-NEXT:    v_writelane_b32 v40, s50, 16
+; GCN-NEXT:    v_writelane_b32 v40, s51, 17
+; GCN-NEXT:    v_writelane_b32 v40, s52, 18
+; GCN-NEXT:    v_writelane_b32 v40, s53, 19
+; GCN-NEXT:    v_writelane_b32 v40, s54, 20
+; GCN-NEXT:    v_writelane_b32 v40, s55, 21
+; GCN-NEXT:    v_writelane_b32 v40, s56, 22
+; GCN-NEXT:    v_writelane_b32 v40, s57, 23
+; GCN-NEXT:    v_writelane_b32 v40, s58, 24
+; GCN-NEXT:    v_writelane_b32 v40, s59, 25
+; GCN-NEXT:    v_writelane_b32 v40, s60, 26
+; GCN-NEXT:    v_writelane_b32 v40, s61, 27
+; GCN-NEXT:    v_writelane_b32 v40, s62, 28
+; GCN-NEXT:    v_writelane_b32 v40, s63, 29
+; GCN-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v0
-; GCN-NEXT:    v_readfirstlane_b32 s9, v1
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    v_readfirstlane_b32 s10, v0
+; GCN-NEXT:    v_readfirstlane_b32 s11, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execnz .LBB9_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_readlane_b32 s63, v40, 31
-; GCN-NEXT:    v_readlane_b32 s62, v40, 30
-; GCN-NEXT:    v_readlane_b32 s61, v40, 29
-; GCN-NEXT:    v_readlane_b32 s60, v40, 28
-; GCN-NEXT:    v_readlane_b32 s59, v40, 27
-; GCN-NEXT:    v_readlane_b32 s58, v40, 26
-; GCN-NEXT:    v_readlane_b32 s57, v40, 25
-; GCN-NEXT:    v_readlane_b32 s56, v40, 24
-; GCN-NEXT:    v_readlane_b32 s55, v40, 23
-; GCN-NEXT:    v_readlane_b32 s54, v40, 22
-; GCN-NEXT:    v_readlane_b32 s53, v40, 21
-; GCN-NEXT:    v_readlane_b32 s52, v40, 20
-; GCN-NEXT:    v_readlane_b32 s51, v40, 19
-; GCN-NEXT:    v_readlane_b32 s50, v40, 18
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    v_readlane_b32 s63, v40, 29
+; GCN-NEXT:    v_readlane_b32 s62, v40, 28
+; GCN-NEXT:    v_readlane_b32 s61, v40, 27
+; GCN-NEXT:    v_readlane_b32 s60, v40, 26
+; GCN-NEXT:    v_readlane_b32 s59, v40, 25
+; GCN-NEXT:    v_readlane_b32 s58, v40, 24
+; GCN-NEXT:    v_readlane_b32 s57, v40, 23
+; GCN-NEXT:    v_readlane_b32 s56, v40, 22
+; GCN-NEXT:    v_readlane_b32 s55, v40, 21
+; GCN-NEXT:    v_readlane_b32 s54, v40, 20
+; GCN-NEXT:    v_readlane_b32 s53, v40, 19
+; GCN-NEXT:    v_readlane_b32 s52, v40, 18
+; GCN-NEXT:    v_readlane_b32 s51, v40, 17
+; GCN-NEXT:    v_readlane_b32 s50, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 15
+; GCN-NEXT:    v_readlane_b32 s48, v40, 14
+; GCN-NEXT:    v_readlane_b32 s47, v40, 13
+; GCN-NEXT:    v_readlane_b32 s46, v40, 12
+; GCN-NEXT:    v_readlane_b32 s45, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
+; GCN-NEXT:    v_readlane_b32 s36, v40, 2
+; GCN-NEXT:    v_readlane_b32 s35, v40, 1
+; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    v_readlane_b32 s33, v40, 32
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_readlane_b32 s33, v40, 30
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr:
 ; GISEL:       ; %bb.0:
@@ -1760,92 +1739,89 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_writelane_b32 v40, s33, 32
+; GISEL-NEXT:    v_writelane_b32 v40, s33, 30
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 19
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 20
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 21
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 22
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 23
-; GISEL-NEXT:    v_writelane_b32 v40, s56, 24
-; GISEL-NEXT:    v_writelane_b32 v40, s57, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s58, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s59, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s60, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s61, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
-; GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
+; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
+; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s45, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 20
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 21
+; GISEL-NEXT:    v_writelane_b32 v40, s56, 22
+; GISEL-NEXT:    v_writelane_b32 v40, s57, 23
+; GISEL-NEXT:    v_writelane_b32 v40, s58, 24
+; GISEL-NEXT:    v_writelane_b32 v40, s59, 25
+; GISEL-NEXT:    v_writelane_b32 v40, s60, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s61, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s62, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s63, 29
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT:    v_readfirstlane_b32 s6, v0
-; GISEL-NEXT:    v_readfirstlane_b32 s7, v1
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
-; GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GISEL-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
 ; GISEL-NEXT:    s_cbranch_execnz .LBB9_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_readlane_b32 s63, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s62, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s61, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s60, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s59, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s58, v40, 26
-; GISEL-NEXT:    v_readlane_b32 s57, v40, 25
-; GISEL-NEXT:    v_readlane_b32 s56, v40, 24
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 23
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 22
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 21
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 20
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; GISEL-NEXT:    v_readlane_b32 s63, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s62, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s61, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s60, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s59, v40, 25
+; GISEL-NEXT:    v_readlane_b32 s58, v40, 24
+; GISEL-NEXT:    v_readlane_b32 s57, v40, 23
+; GISEL-NEXT:    v_readlane_b32 s56, v40, 22
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 21
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 20
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s45, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
+; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
+; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    v_readlane_b32 s33, v40, 32
-; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GISEL-NEXT:    v_readlane_b32 s33, v40, 30
+; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-NEXT:    s_setpc_b64 s[4:5]
   tail call amdgpu_gfx void %fptr()
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index d3e18dbaaa6d9..b303e69064a02 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5242890 /* regdef:SGPR_128 */, def %4
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5767178 /* regdef:SGPR_128 */, def %4
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5242889 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5767177 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5242890 /* regdef:SGPR_128 */, def %4
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5767178 /* regdef:SGPR_128 */, def %4
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5242889 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5767177 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
   call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -26,15 +26,15 @@ define amdgpu_kernel void @s_input_output_i128() {
 define amdgpu_kernel void @v_input_output_i128() {
   ; GFX908-LABEL: name: v_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:VReg_128 */, def %4
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5308426 /* regdef:VReg_128 */, def %4
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %4
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5308425 /* reguse:VReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ; GFX90A-LABEL: name: v_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4980746 /* regdef:VReg_128_Align2 */, def %4
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5505034 /* regdef:VReg_128_Align2 */, def %4
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4980745 /* reguse:VReg_128_Align2 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5505033 /* reguse:VReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
   call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -44,15 +44,15 @@ define amdgpu_kernel void @v_input_output_i128() {
 define amdgpu_kernel void @a_input_output_i128() {
   ; GFX908-LABEL: name: a_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4718602 /* regdef:AReg_128 */, def %4
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5242890 /* regdef:AReg_128 */, def %4
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %4
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4718601 /* reguse:AReg_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5242889 /* reguse:AReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ; GFX90A-LABEL: name: a_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4915210 /* regdef:AReg_128_Align2 */, def %4
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5439498 /* regdef:AReg_128_Align2 */, def %4
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:AReg_128_Align2 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5439497 /* reguse:AReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
   call void asm sideeffect "; use $0", "a"(i128 %val)

diff  --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 6e4752b05110a..1466d3b76bec0 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -40,16 +40,20 @@ define amdgpu_kernel void @kernel_call() #0 {
 }
 
 ; GCN-LABEL: {{^}}func_regular_call:
+; GCN-NOT: buffer_store
 ; GCN-NOT: buffer_load
 ; GCN-NOT: readlane
-; GCN: flat_load_dword v9
+; GCN-NOT: writelane
+; GCN: flat_load_dword v8
 ; GCN: s_swappc_b64
+; GCN-NOT: buffer_store
 ; GCN-NOT: buffer_load
 ; GCN-NOT: readlane
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v9
+; GCN-NOT: writelane
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 
-; GCN: ; NumSgprs: 34
-; GCN: ; NumVgprs: 10
+; GCN: ; NumSgprs: 32
+; GCN: ; NumVgprs: 9
 define void @func_regular_call() #1 {
   %vgpr = load volatile i32, i32 addrspace(1)* undef
   tail call void @func()
@@ -72,13 +76,13 @@ define void @func_tail_call() #1 {
 }
 
 ; GCN-LABEL: {{^}}func_call_tail_call:
-; GCN: flat_load_dword v9
+; GCN: flat_load_dword v8
 ; GCN: s_swappc_b64
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v9
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 ; GCN: s_setpc_b64
 
-; GCN: ; NumSgprs: 34
-; GCN: ; NumVgprs: 10
+; GCN: ; NumSgprs: 32
+; GCN: ; NumVgprs: 9
 define void @func_call_tail_call() #1 {
   %vgpr = load volatile i32, i32 addrspace(1)* undef
   tail call void @func()
@@ -93,11 +97,8 @@ define void @void_func_void() noinline {
 
 ; Make sure we don't get save/restore of FP between calls.
 ; GCN-LABEL: {{^}}test_funcx2:
-; GCN: s_getpc_b64
+; GCN-NOT: s5
 ; GCN-NOT: s32
-; GCN: s_swappc_b64
-; GCN-NOT: s32
-; GCN: s_swappc_b64
 define void @test_funcx2() #0 {
   call void @void_func_void()
   call void @void_func_void()

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
index 581bb7233e505..9b6143d7a952b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -197,8 +197,6 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #
 ; GCN-NOT: s8
 ; GCN-NOT: s9
 ; GCN-NOT: s[8:9]
-; GCN: s_swappc_b64
-; GCN: s_setpc_b64 s[30:31]
 define void @func_call_implicitarg_ptr_func() #0 {
   call void @func_implicitarg_ptr()
   ret void
@@ -208,8 +206,6 @@ define void @func_call_implicitarg_ptr_func() #0 {
 ; GCN-NOT: s8
 ; GCN-NOT: s9
 ; GCN-NOT: s[8:9]
-; GCN: s_swappc_b64
-; GCN: s_setpc_b64 s[30:31]
 define void @opencl_func_call_implicitarg_ptr_func() #0 {
   call void @func_implicitarg_ptr()
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/llvm.powi.ll
index a95df62680343..7838d5ab8defd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.powi.ll
@@ -57,9 +57,9 @@ define float @v_powi_neg1_f32(float %l) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX7-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX7-NEXT:    v_fma_f32 v2, v3, v2, v2
 ; GFX7-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX7-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
-; GFX7-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
@@ -104,9 +104,9 @@ define float @v_powi_neg2_f32(float %l) {
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX7-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX7-NEXT:    v_fma_f32 v2, v3, v2, v2
 ; GFX7-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX7-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
-; GFX7-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
@@ -200,9 +200,9 @@ define float @v_powi_neg128_f32(float %l) {
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX7-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
+; GFX7-NEXT:    v_fma_f32 v2, v3, v2, v2
 ; GFX7-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX7-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
-; GFX7-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 7ae67bad68aea..2fac70d96bc31 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -193,22 +193,22 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, foo@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v42, v0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v42, v41
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    v_and_b32_e32 v43, 0xffffff, v41
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
@@ -220,17 +220,17 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s35, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
   %b = and i32 %b.arg, 16777215
   %s = and i32 %s.arg, 16777215
 

diff  --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 109163bf73737..68646ad229a7c 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -44,11 +44,11 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[10:11]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; clobber csr v40
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0xfffffc00
 ; CHECK-NEXT:    v_readlane_b32 s33, v1, 2
@@ -168,8 +168,8 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[10:11]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0xfffffc00
 ; CHECK-NEXT:    v_readlane_b32 s33, v1, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], -1
@@ -202,8 +202,8 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[2:3], s[10:11]
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v2, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v2, 1
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0xfffffc00
 ; CHECK-NEXT:    v_readlane_b32 s33, v2, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], -1

diff  --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index 2ae8530e1ed19..327136fdfe1ed 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -22,8 +22,8 @@ declare void @external_void_func_i32(i32) #0
 
 ; GCN: s_swappc_b64
 
-; GCN: v_readlane_b32 s31, v40, 1
-; GCN: v_readlane_b32 s30, v40, 0
+; GCN: v_readlane_b32 s4, v40, 0
+; GCN: v_readlane_b32 s5, v40, 1
 
 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: v_readlane_b32 s33, v40, 2
@@ -31,7 +31,7 @@ declare void @external_void_func_i32(i32) #0
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64 s[4:5]
 define void @test_func_call_external_void_func_i32_imm() #0 {
   call void @external_void_func_i32(i32 42)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir b/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir
index d5abbc0172604..4f4bdcd6f5b41 100644
--- a/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir
+++ b/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir
@@ -30,7 +30,8 @@ body:             |
   ; GFX9-LABEL: name: index_vgpr_waterfall_loop
   ; GFX9: bb.0:
   ; GFX9:   successors: %bb.1(0x80000000)
-  ; GFX9:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16
+  ; GFX9:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $sgpr30_sgpr31
+  ; GFX9:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
   ; GFX9:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr16
   ; GFX9:   undef %18.sub15:vreg_512 = COPY $vgpr15
   ; GFX9:   %18.sub14:vreg_512 = COPY $vgpr14
@@ -62,6 +63,7 @@ body:             |
   ; GFX9:   S_CBRANCH_EXECNZ %bb.1, implicit $exec
   ; GFX9: bb.2:
   ; GFX9:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; GFX9:   $sgpr30_sgpr31 = COPY [[COPY]]
   ; GFX9:   $vgpr0 = COPY [[V_MOV_B32_e32_]]
   ; GFX9:   S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit undef $vgpr1, implicit undef $vgpr2, implicit undef $vgpr3
   bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 1e424ecde23d5..84643f945ac5f 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -19,10 +19,10 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
 ; CHECK-NEXT:    v_writelane_b32 v40, s33, 2
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0x400
-; CHECK-NEXT:  .Ltmp0:
-; CHECK-NEXT:    .loc 0 31 3 prologue_end ; lane-info.cpp:31:3
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:    .loc 0 31 3 prologue_end ; lane-info.cpp:31:3
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _ZL13sleep_foreverv at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _ZL13sleep_foreverv at gotpcrel32@hi+12
@@ -34,14 +34,14 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:  .Ltmp1:
-; CHECK-NEXT:    .loc 0 32 1 ; lane-info.cpp:32:1
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0xfffffc00
 ; CHECK-NEXT:    v_readlane_b32 s33, v40, 2
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    .loc 0 32 1 ; lane-info.cpp:32:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .Ltmp2:

diff  --git a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
index d474ea5257994..e0e211243771f 100644
--- a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
@@ -11,8 +11,8 @@ define void @child_function() #0 {
 ; GCN:  v_writelane_b32 v255, s30, 0
 ; GCN:  v_writelane_b32 v255, s31, 1
 ; GCN:  s_swappc_b64 s[30:31], s[4:5]
-; GCN:  v_readlane_b32 s31, v255, 1
 ; GCN:  v_readlane_b32 s30, v255, 0
+; GCN:  v_readlane_b32 s31, v255, 1
 ; GCN:  v_readlane_b32 s33, v255, 2
 ; GCN: ; NumVgprs: 256
 
@@ -57,8 +57,8 @@ define void @reserve_vgpr_with_no_lower_vgpr_available() #0 {
 ; GCN:  v_writelane_b32 v254, s30, 0
 ; GCN:  v_writelane_b32 v254, s31, 1
 ; GCN:  s_swappc_b64 s[30:31], s[4:5]
-; GCN:  v_readlane_b32 s31, v254, 1
 ; GCN:  v_readlane_b32 s30, v254, 0
+; GCN:  v_readlane_b32 s31, v254, 1
 ; GCN:  v_readlane_b32 s33, v254, 2
 
 define void @reserve_lowest_available_vgpr() #0 {
@@ -99,8 +99,8 @@ define void @reserve_lowest_available_vgpr() #0 {
 ; GCN-LABEL: {{^}}reserve_vgpr_with_sgpr_spills:
 ; GCN-NOT:  buffer_store_dword v255, off, s[0:3], s32
 ; GCN: ; def s4
-; GCN: v_writelane_b32 v254, s4, 0
-; GCN: v_readlane_b32 s4, v254, 0
+; GCN: v_writelane_b32 v254, s4, 2
+; GCN: v_readlane_b32 s4, v254, 2
 ; GCN: ; use s4
 
 define void @reserve_vgpr_with_sgpr_spills() #0 {

diff  --git a/llvm/test/CodeGen/AMDGPU/save-fp.ll b/llvm/test/CodeGen/AMDGPU/save-fp.ll
index ec56f41aa1a0a..6bed1dbf901e9 100644
--- a/llvm/test/CodeGen/AMDGPU/save-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/save-fp.ll
@@ -11,14 +11,16 @@ bb:
 
 ; GCN-LABEL: {{^}}caller:
 
-; GCN:     v_writelane_b32 v2, s33, 2
-; GCN:     s_mov_b32 s33, s32
+; GFX900:     s_mov_b32 [[SAVED_FP:s[0-9]+]], s33
+; GFX900:     s_mov_b32 s33, s32
+; GFX908-NOT: s_mov_b32 s33, s32
 ; GFX900:     buffer_store_dword
+; GFX908-DAG: s_mov_b32 [[SAVED_FP:s[0-9]+]], s33
 ; GFX908-DAG: v_accvgpr_write_b32
 ; GCN:        s_swappc_b64
 ; GFX900:     buffer_load_dword
 ; GFX908:     v_accvgpr_read_b32
-; GCN:        v_readlane_b32 s33, v2, 2
+; GCN:        s_mov_b32 s33, [[SAVED_FP]]
 define i64 @caller() {
 bb:
   call void asm sideeffect "", "~{v40}" ()

diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 054d4ef3651e4..fccfb8fe2f212 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -201,18 +201,18 @@ entry:
 ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
 ; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec
-; GCN: v_writelane_b32 [[CSRV]], s33, 2
+; GCN: s_mov_b32 s33, s32
 ; GCN-DAG: s_addk_i32 s32, 0x400
 
+; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-DAG: v_writelane_b32 [[CSRV]], s34, 0
+; GCN-DAG: v_writelane_b32 [[CSRV]], s35, 1
+
 ; GCN-DAG: s_getpc_b64 s[4:5]
 ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
 ; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
 
-; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0
-; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1
-
 
 ; GCN: s_swappc_b64
 
@@ -223,8 +223,8 @@ entry:
 ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
 
-; GCN-DAG: v_readlane_b32 s30, [[CSRV]], 0
-; GCN-DAG: v_readlane_b32 s31, [[CSRV]], 1
+; GCN-DAG: v_readlane_b32 s34, [[CSRV]], 0
+; GCN-DAG: v_readlane_b32 s35, [[CSRV]], 1
 
 ; GCN: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: v_readlane_b32 s33,

diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
index d7e368f4d6d9f..23714f0323ef3 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
@@ -89,11 +89,11 @@ body:             |
   ; VR:   renamable $sgpr37 = S_MOV_B32 -1
   ; VR:   renamable $sgpr36 = S_MOV_B32 -1
   ; VR:   renamable $sgpr68 = S_MOV_B32 0
-  ; VR:   renamable $sgpr30_sgpr31 = IMPLICIT_DEF
   ; VR:   renamable $sgpr34_sgpr35 = IMPLICIT_DEF
+  ; VR:   renamable $sgpr66_sgpr67 = IMPLICIT_DEF
   ; VR: bb.1:
   ; VR:   successors: %bb.2(0x80000000)
-  ; VR:   liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003, $sgpr30_sgpr31, $sgpr34_sgpr35
+  ; VR:   liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003, $sgpr34_sgpr35, $sgpr66_sgpr67
   ; VR:   renamable $sgpr38 = COPY renamable $sgpr36
   ; VR:   renamable $sgpr39 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr40 = COPY renamable $sgpr36
@@ -155,8 +155,8 @@ body:             |
   ; VR:   renamable $sgpr99 = COPY renamable $sgpr68
   ; VR: bb.2:
   ; VR:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; VR:   liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003, $sgpr30_sgpr31, $sgpr34_sgpr35
-  ; VR:   S_NOP 0, csr_amdgpu_highregs, implicit renamable $sgpr30_sgpr31, implicit renamable $sgpr34_sgpr35
+  ; VR:   liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003, $sgpr34_sgpr35, $sgpr66_sgpr67
+  ; VR:   S_NOP 0, csr_amdgpu_highregs, implicit renamable $sgpr34_sgpr35, implicit renamable $sgpr66_sgpr67
   ; VR:   S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
   ; VR:   S_BRANCH %bb.2
   bb.0:

diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 37df92e9700a8..c7031a8921504 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -171,15 +171,12 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
 ; GCN: s_swappc_b64 s[30:31],
 
-; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1
-; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0
 ; GCN: s_add_i32 s32, s32, 0xfffd0000
 ; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2
 ; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
   %temp = alloca i32, align 1024, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %temp, align 1024
   call void @extern_func(<32 x i32> %a, i32 %b)

diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
index bece8aab1e3d2..884b56e7a5304 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
@@ -186,13 +186,13 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX10-NEXT:    v_add_f16_e32 v5, v0, v2
+; GFX10-NEXT:    v_add_f16_e32 v4, v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    v_add_f16_e32 v6, v1, v3
 ; GFX10-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_b32_e32 v2, v4, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, v5, v6
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
index 00aa96fea128a..34c5e908ac310 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
@@ -186,13 +186,13 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX10-NEXT:    v_mul_f16_e32 v5, v0, v2
+; GFX10-NEXT:    v_mul_f16_e32 v4, v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    v_mul_f16_e32 v6, v1, v3
 ; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_b32_e32 v2, v4, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, v5, v6
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index 802c08fe64a26..4cf75f68a7a1d 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -206,13 +206,13 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX10-NEXT:    v_sub_f16_e32 v5, v0, v2
+; GFX10-NEXT:    v_sub_f16_e32 v4, v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    v_sub_f16_e32 v6, v1, v3
 ; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_b32_e32 v2, v4, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, v5, v6
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
index fba11bf0124ef..4894eeca58253 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
@@ -20,28 +20,25 @@ define amdgpu_gfx float @caller(float %arg0) {
 ; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    v_writelane_b32 v1, s33, 3
-; GCN-NEXT:    v_writelane_b32 v1, s4, 0
+; GCN-NEXT:    v_writelane_b32 v1, s33, 1
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    v_writelane_b32 v1, s30, 1
+; GCN-NEXT:    v_writelane_b32 v1, s4, 0
 ; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    s_mov_b32 s4, 2.0
-; GCN-NEXT:    v_writelane_b32 v1, s31, 2
-; GCN-NEXT:    s_getpc_b64 s[34:35]
-; GCN-NEXT:    s_add_u32 s34, s34, callee@rel32@lo+4
-; GCN-NEXT:    s_addc_u32 s35, s35, callee@rel32@hi+12
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GCN-NEXT:    v_readlane_b32 s31, v1, 2
-; GCN-NEXT:    v_readlane_b32 s30, v1, 1
+; GCN-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GCN-NEXT:    s_getpc_b64 s[30:31]
+; GCN-NEXT:    s_add_u32 s30, s30, callee@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s31, s31, callee@rel32@hi+12
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GCN-NEXT:    v_readlane_b32 s4, v1, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    v_readlane_b32 s33, v1, 3
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GCN-NEXT:    v_readlane_b32 s33, v1, 1
+; GCN-NEXT:    s_or_saveexec_b64 s[30:31], -1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    s_mov_b64 exec, s[30:31]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[36:37]
   %add = fadd float %arg0, 1.0
   %call = tail call amdgpu_gfx float @callee(float %add, float inreg 2.0)
   ret float %call

diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index b991d6cd5149e..d678b64f54b69 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -2500,7 +2500,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
 ; SI-NEXT:    v_rcp_f32_e32 v2, v2
 ; SI-NEXT:    s_mov_b32 s4, 0xfffe7960
-; SI-NEXT:    v_mov_b32_e32 v8, 0
+; SI-NEXT:    v_mov_b32_e32 v9, 0
 ; SI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; SI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; SI-NEXT:    v_trunc_f32_e32 v3, v3
@@ -2508,22 +2508,22 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
-; SI-NEXT:    v_mul_lo_u32 v6, v3, s4
-; SI-NEXT:    v_mul_lo_u32 v5, v2, s4
+; SI-NEXT:    v_mul_lo_u32 v5, v3, s4
+; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
 ; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; SI-NEXT:    v_mul_hi_u32 v7, v2, v5
-; SI-NEXT:    v_mul_lo_u32 v6, v2, v4
-; SI-NEXT:    v_mul_hi_u32 v9, v2, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; SI-NEXT:    v_mul_lo_u32 v5, v2, v4
+; SI-NEXT:    v_mul_hi_u32 v7, v2, v6
+; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
 ; SI-NEXT:    v_mul_hi_u32 v10, v3, v4
 ; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
-; SI-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; SI-NEXT:    v_mul_lo_u32 v9, v3, v5
-; SI-NEXT:    v_mul_hi_u32 v5, v3, v5
-; SI-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; SI-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
+; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; SI-NEXT:    v_mul_lo_u32 v8, v3, v6
+; SI-NEXT:    v_mul_hi_u32 v6, v3, v6
+; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, v10, v9, vcc
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -2536,16 +2536,16 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; SI-NEXT:    v_mul_lo_u32 v5, v2, v4
 ; SI-NEXT:    v_mul_hi_u32 v7, v2, v6
-; SI-NEXT:    v_mul_hi_u32 v9, v2, v4
+; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
 ; SI-NEXT:    v_mul_hi_u32 v10, v3, v4
 ; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; SI-NEXT:    v_mul_lo_u32 v9, v3, v6
+; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; SI-NEXT:    v_mul_lo_u32 v8, v3, v6
 ; SI-NEXT:    v_mul_hi_u32 v6, v3, v6
-; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
-; SI-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, v10, v9, vcc
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -2561,7 +2561,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; SI-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
-; SI-NEXT:    v_addc_u32_e32 v4, vcc, v7, v8, vcc
+; SI-NEXT:    v_addc_u32_e32 v4, vcc, v7, v9, vcc
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; SI-NEXT:    v_mul_lo_u32 v4, v3, s4
@@ -2610,15 +2610,15 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
 ; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
-; VI-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
-; VI-NEXT:    v_mul_hi_u32 v5, v6, v2
-; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
-; VI-NEXT:    v_add_u32_e32 v10, vcc, v5, v3
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v4, v3
+; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
+; VI-NEXT:    v_mul_hi_u32 v8, v6, v2
+; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
-; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v4, vcc
-; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
-; VI-NEXT:    v_add_u32_e32 v2, vcc, v10, v2
-; VI-NEXT:    v_addc_u32_e32 v2, vcc, v11, v3, vcc
+; VI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
+; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2698,15 +2698,15 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
 ; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, v6, v2
-; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
-; GCN-NEXT:    v_add_u32_e32 v10, vcc, v5, v3
+; GCN-NEXT:    v_add_u32_e32 v5, vcc, v4, v3
+; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
+; GCN-NEXT:    v_mul_hi_u32 v8, v6, v2
+; GCN-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v4, vcc
-; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v10, v2
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v11, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
+; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
+; GCN-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
 ; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc

diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 1f9b7437e8ba8..c124446beed9d 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -230,10 +230,10 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GCN-NEXT:    v_mul_lo_u32 v8, v6, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
+; GCN-NEXT:    v_mul_lo_u32 v9, v6, v5
+; GCN-NEXT:    v_mul_hi_u32 v8, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v10, v7, v4
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GCN-NEXT:    v_mul_lo_u32 v9, v6, v4
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GCN-NEXT:    v_mul_lo_u32 v10, v4, v8
@@ -1558,7 +1558,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_madak_f32 v2, 0, v2, 0x41c00000
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    s_movk_i32 s4, 0xffe8
-; GCN-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1566,22 +1566,22 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, v2, s4
-; GCN-NEXT:    v_mul_lo_u32 v6, v3, s4
-; GCN-NEXT:    v_mul_lo_u32 v5, v2, s4
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, s4
+; GCN-NEXT:    v_mul_lo_u32 v6, v2, s4
 ; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GCN-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, v2, v6
+; GCN-NEXT:    v_mul_hi_u32 v8, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GCN-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -1593,16 +1593,16 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; GCN-NEXT:    v_mul_lo_u32 v9, v3, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -1618,7 +1618,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v8, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, 24

diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index f20504019f1f9..347b84513a530 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -51,15 +51,15 @@ define hidden void @widget() {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:  .LBB0_7: ; %UnifiedReturnBlock
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_readlane_b32 s4, v40, 0
+; GCN-NEXT:    v_readlane_b32 s5, v40, 1
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    v_readlane_b32 s33, v40, 2
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 ; SI-OPT-LABEL: @widget(
 ; SI-OPT-NEXT:  bb:
 ; SI-OPT-NEXT:    [[TMP:%.*]] = load i32, i32 addrspace(1)* null, align 16
@@ -188,7 +188,7 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
-; GCN-NEXT:    v_writelane_b32 v40, s33, 17
+; GCN-NEXT:    v_writelane_b32 v40, s33, 15
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x800
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
@@ -196,23 +196,21 @@ define hidden void @blam() {
 ; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s48, 15
-; GCN-NEXT:    v_writelane_b32 v40, s49, 16
+; GCN-NEXT:    v_writelane_b32 v40, s34, 0
+; GCN-NEXT:    v_writelane_b32 v40, s35, 1
+; GCN-NEXT:    v_writelane_b32 v40, s36, 2
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s45, 11
+; GCN-NEXT:    v_writelane_b32 v40, s46, 12
+; GCN-NEXT:    v_writelane_b32 v40, s48, 13
+; GCN-NEXT:    v_writelane_b32 v40, s49, 14
 ; GCN-NEXT:    v_mov_b32_e32 v41, v31
 ; GCN-NEXT:    s_mov_b32 s44, s14
 ; GCN-NEXT:    s_mov_b32 s45, s13

diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
index ba3d120be9ba2..79723603f1c1a 100644
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -76,21 +76,22 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK-NEXT:    s_mov_b32 s5, 0x8311eb33
 ; CHECK-NEXT:    s_mov_b32 s6, 0x20140c
 ; CHECK-NEXT:    s_mov_b32 s7, 0xb6db6db7
-; CHECK-NEXT:    s_mov_b32 s8, 0x24924924
-; CHECK-NEXT:    s_mov_b32 s9, 0xaaaaaaab
-; CHECK-NEXT:    s_mov_b32 s10, 0x2aaaaaaa
+; CHECK-NEXT:    s_mov_b32 s8, 0x49249249
+; CHECK-NEXT:    s_mov_b32 s9, 0x24924924
+; CHECK-NEXT:    s_mov_b32 s10, 0xaaaaaaab
+; CHECK-NEXT:    s_mov_b32 s11, 0x2aaaaaaa
 ; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
 ; CHECK-NEXT:    v_and_b32_e32 v1, s4, v1
 ; CHECK-NEXT:    v_and_b32_e32 v2, s4, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v2, v2, s5
 ; CHECK-NEXT:    v_mul_lo_u32 v1, v1, s7
-; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s9
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s10
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 0xf9dc299a, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0x49249249, v1
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, s8, v1
 ; CHECK-NEXT:    v_alignbit_b32 v0, v0, v0, 1
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s11, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s8, v1
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc

diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index b14825871cea2..00bae7afa10fa 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -240,10 +240,10 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GCN-NEXT:    v_mul_lo_u32 v8, v6, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
+; GCN-NEXT:    v_mul_lo_u32 v9, v6, v5
+; GCN-NEXT:    v_mul_hi_u32 v8, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v10, v7, v4
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GCN-NEXT:    v_mul_lo_u32 v9, v6, v4
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GCN-NEXT:    v_mul_lo_u32 v10, v4, v8

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 46ee737d31258..9b4bc71eed41a 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -29,8 +29,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9: v_writelane_b32 v40, s30, 0
-; GFX9: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
 ; GFX9: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
 
@@ -38,10 +37,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9: v_readlane_b32 s31, v40, 1
-; GFX9: v_readlane_b32 s30, v40, 0
 ; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9: s_setpc_b64 s[30:31]
+; GFX9: s_setpc_b64 s[4:5]
 ;
 ; GFX10-LABEL: non_preserved_vgpr_tuple8:
 ; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
@@ -66,9 +63,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10: v_writelane_b32 v40, s30, 0
 ; GFX10: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10: v_writelane_b32 v40, s31, 1
 ; GFX10: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
 
@@ -77,10 +72,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
 
-; GFX10: v_readlane_b32 s31, v40, 1
-; GFX10: v_readlane_b32 s30, v40, 0
 ; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX10: s_setpc_b64 s[30:31]
+; GFX10: s_setpc_b64 s[4:5]
 main_body:
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
   call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
@@ -99,8 +92,6 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 
 ; GFX9-LABEL: call_preserved_vgpr_tuple8:
 ; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9: v_writelane_b32 v40, s30, 0
-; GFX9: v_writelane_b32 v40, s31, 1
 ; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
@@ -119,7 +110,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -131,10 +122,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 
-; GFX9: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
 ; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9: s_setpc_b64 s[30:31]
+; GFX9: s_setpc_b64 s[4:5]
 ;
 ; GFX10-LABEL: call_preserved_vgpr_tuple8:
 ; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -144,20 +133,15 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
 
-; GFX10: v_writelane_b32 v40, s30, 0
-; GFX10: v_mov_b32_e32 v41, v16
-; GFX10: v_mov_b32_e32 v42, v15
-; GFX10: v_mov_b32_e32 v43, v14
-; GFX10: v_mov_b32_e32 v44, v13
-; GFX10: v_writelane_b32 v40, s31, 1
-; GFX10: v_mov_b32_e32 v45, v12
 
 ; GFX10:      image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -169,10 +153,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8
 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12
 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16
-; GFX10: v_readlane_b32 s31, v40, 1
-; GFX10: v_readlane_b32 s30, v40, 0
 ; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:20
-; GFX10: s_setpc_b64 s[30:31]
+; GFX10: s_setpc_b64 s[4:5]
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
   store <4 x float> %v, <4 x float> addrspace(1)* undef

diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index eecff44dec3ce..fcdbeea76fa28 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1136,8 +1136,8 @@ declare void @external_void_func_void() #1
 ; GCN-DAG: v_writelane_b32 v40, s30, 0
 ; GCN-DAG: v_writelane_b32 v40, s31, 1
 ; GCN: s_swappc_b64
-; GCN-DAG: v_readlane_b32 s30, v40, 0
-; GCN-DAG: v_readlane_b32 s31, v40, 1
+; GCN-DAG: v_readlane_b32 s4, v40, 0
+; GCN-DAG: v_readlane_b32 s5, v40, 1
 
 
 ; GFX1064: s_addk_i32 s32, 0xfc00

diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index c01485cc3e8ec..6a079bc4e52d8 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -138,6 +138,8 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
 ; GFX9-O0-NEXT:    s_mov_b32 s37, s5
@@ -145,37 +147,37 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    s_mov_b32 s39, s7
 ; GFX9-O0-NEXT:    s_mov_b64 s[42:43], s[38:39]
 ; GFX9-O0-NEXT:    s_mov_b64 s[40:41], s[36:37]
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s40, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s41, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s42, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s43, 3
-; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], off, s[36:39], s34
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s40, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s41, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s42, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s43, 5
+; GFX9-O0-NEXT:    s_mov_b32 s30, 0
+; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], off, s[36:39], s30
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s30
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s30
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[36:37], v0, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s34
+; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[34:35], v0, s30
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s30
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[34:35], exec
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s34, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s35, 5
-; GFX9-O0-NEXT:    s_and_b64 s[34:35], s[34:35], s[36:37]
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT:    s_mov_b64 s[30:31], exec
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s30, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s31, 7
+; GFX9-O0-NEXT:    s_and_b64 s[30:31], s[30:31], s[34:35]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-O0-NEXT:  ; %bb.1: ; %if
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
@@ -183,27 +185,29 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[30:31], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[30:31], -1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v2, v1
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:  .LBB1_2: ; %merge
-; GFX9-O0-NEXT:    v_readlane_b32 s34, v5, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s35, v5, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s34, v5, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s35, v5, 7
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GFX9-O0-NEXT:    v_readlane_b32 s36, v5, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s37, v5, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s38, v5, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s39, v5, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s36, v5, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s37, v5, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s38, v5, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s39, v5, 5
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -357,22 +361,22 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
-; GFX9-O0-NEXT:    s_getpc_b64 s[42:43]
-; GFX9-O0-NEXT:    s_add_u32 s42, s42, strict_wwm_called@rel32@lo+4
-; GFX9-O0-NEXT:    s_addc_u32 s43, s43, strict_wwm_called@rel32@hi+12
+; GFX9-O0-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-O0-NEXT:    s_add_u32 s30, s30, strict_wwm_called@rel32@lo+4
+; GFX9-O0-NEXT:    s_addc_u32 s31, s31, strict_wwm_called@rel32@hi+12
 ; GFX9-O0-NEXT:    s_mov_b64 s[46:47], s[2:3]
 ; GFX9-O0-NEXT:    s_mov_b64 s[44:45], s[0:1]
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[44:45]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[46:47]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-O0-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffffc00
 ; GFX9-O0-NEXT:    v_readlane_b32 s33, v3, 2
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -387,41 +391,36 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_writelane_b32 v3, s33, 2
-; GFX9-O3-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX9-O3-NEXT:    s_mov_b32 s38, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-O3-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-O3-NEXT:    s_mov_b64 s[36:37], s[30:31]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
-; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called@rel32@hi+12
-; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX9-O3-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-O3-NEXT:    s_add_u32 s30, s30, strict_wwm_called@rel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s31, s31, strict_wwm_called@rel32@hi+12
+; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-O3-NEXT:    v_readlane_b32 s33, v3, 2
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    s_mov_b32 s33, s38
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[30:31], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-O3-NEXT:    s_setpc_b64 s[36:37]
   %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
   %tmp134 = call amdgpu_gfx i32 @strict_wwm_called(i32 %tmp107)
   %tmp136 = add i32 %tmp134, %tmp107
@@ -535,39 +534,39 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xc00
 ; GFX9-O0-NEXT:    v_writelane_b32 v10, s30, 0
 ; GFX9-O0-NEXT:    v_writelane_b32 v10, s31, 1
-; GFX9-O0-NEXT:    s_mov_b32 s36, s8
-; GFX9-O0-NEXT:    s_mov_b32 s40, s4
-; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43
-; GFX9-O0-NEXT:    s_mov_b32 s41, s5
-; GFX9-O0-NEXT:    s_mov_b32 s42, s6
-; GFX9-O0-NEXT:    s_mov_b32 s43, s7
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s40, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s41, 3
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s42, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s43, 5
-; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37
-; GFX9-O0-NEXT:    s_mov_b32 s37, s9
-; GFX9-O0-NEXT:    ; kill: def $sgpr34_sgpr35 killed $sgpr36_sgpr37
-; GFX9-O0-NEXT:    s_mov_b64 s[34:35], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s37
+; GFX9-O0-NEXT:    s_mov_b32 s34, s8
+; GFX9-O0-NEXT:    s_mov_b32 s36, s4
+; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
+; GFX9-O0-NEXT:    s_mov_b32 s37, s5
+; GFX9-O0-NEXT:    s_mov_b32 s38, s6
+; GFX9-O0-NEXT:    s_mov_b32 s39, s7
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s36, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s37, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s38, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s39, 5
+; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
+; GFX9-O0-NEXT:    s_mov_b32 s35, s9
+; GFX9-O0-NEXT:    ; kill: def $sgpr30_sgpr31 killed $sgpr34_sgpr35
+; GFX9-O0-NEXT:    s_mov_b64 s[30:31], 0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s35
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v0
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s30
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s31
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s34, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s35, 7
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[30:31], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s30, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s31, 7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-O0-NEXT:    s_mov_b32 s34, 32
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s34, v[8:9]
-; GFX9-O0-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-O0-NEXT:    s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
-; GFX9-O0-NEXT:    s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
-; GFX9-O0-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-O0-NEXT:    s_mov_b32 s30, 32
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s30, v[8:9]
+; GFX9-O0-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-O0-NEXT:    s_add_u32 s30, s30, strict_wwm_called_i64@gotpcrel32@lo+4
+; GFX9-O0-NEXT:    s_addc_u32 s31, s31, strict_wwm_called_i64@gotpcrel32@hi+12
+; GFX9-O0-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-O0-NEXT:    s_mov_b64 s[38:39], s[2:3]
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], s[0:1]
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[36:37]
@@ -575,13 +574,15 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-O0-NEXT:    v_readlane_b32 s34, v10, 6
 ; GFX9-O0-NEXT:    v_readlane_b32 s35, v10, 7
 ; GFX9-O0-NEXT:    v_readlane_b32 s36, v10, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s37, v10, 3
 ; GFX9-O0-NEXT:    v_readlane_b32 s38, v10, 4
 ; GFX9-O0-NEXT:    v_readlane_b32 s39, v10, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s30, v10, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v8
@@ -593,8 +594,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
 ; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffff400
 ; GFX9-O0-NEXT:    v_readlane_b32 s33, v10, 8
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -626,7 +625,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -636,19 +634,18 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_writelane_b32 v8, s33, 2
-; GFX9-O3-NEXT:    v_writelane_b32 v8, s30, 0
+; GFX9-O3-NEXT:    s_mov_b32 s40, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-O3-NEXT:    v_writelane_b32 v8, s31, 1
+; GFX9-O3-NEXT:    s_mov_b64 s[36:37], s[30:31]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
-; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called_i64@gotpcrel32@hi+12
-; GFX9-O3-NEXT:    s_load_dwordx2 s[36:37], s[36:37], 0x0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[30:31], -1
+; GFX9-O3-NEXT:    s_getpc_b64 s[34:35]
+; GFX9-O3-NEXT:    s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
+; GFX9-O3-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
@@ -657,7 +654,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
@@ -666,13 +663,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v8, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xf800
-; GFX9-O3-NEXT:    v_readlane_b32 s33, v8, 2
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
+; GFX9-O3-NEXT:    s_mov_b32 s33, s40
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[30:31], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -683,9 +676,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-O3-NEXT:    s_setpc_b64 s[36:37]
   %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
   %tmp134 = call amdgpu_gfx i64 @strict_wwm_called_i64(i64 %tmp107)
   %tmp136 = add i64 %tmp134, %tmp107

diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-metadata.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-metadata.mir
index 0ef574c2b3f7e..a9fbe15b244e9 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-metadata.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-metadata.mir
@@ -77,9 +77,10 @@ machineMetadataNodes:
   - '!7 = distinct !{!7, !"MemcpyLoweringDomain"}'
 body:             |
   bb.0 (%ir-block.0):
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_memcpy
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
@@ -95,8 +96,11 @@ body:             |
     ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0
     ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub1
     ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[COPY8]], killed [[COPY9]], 0, implicit $exec
+    ; CHECK: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
     ; CHECK: $vgpr0 = COPY [[V_ADD_U32_e64_]]
-    ; CHECK: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
+    ; CHECK: S_SETPC_B64_return [[COPY11]], implicit $vgpr0
+    %4:sreg_64 = COPY $sgpr30_sgpr31
     %3:vgpr_32 = COPY $vgpr3
     %2:vgpr_32 = COPY $vgpr2
     %1:vgpr_32 = COPY $vgpr1
@@ -112,8 +116,10 @@ body:             |
     %13:vgpr_32 = COPY %11.sub0
     %14:vgpr_32 = COPY %11.sub1
     %15:vgpr_32 = V_ADD_U32_e64 killed %13, killed %14, 0, implicit $exec
+    %5:ccr_sgpr_64 = COPY %4
     $vgpr0 = COPY %15
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %16:ccr_sgpr_64 = COPY %5
+    S_SETPC_B64_return %16, implicit $vgpr0
 
 ...
 ---
@@ -128,9 +134,10 @@ machineMetadataNodes:
   - '!10 = !{!1, !9}'
 body:             |
   bb.0 (%ir-block.0):
-    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
 
     ; CHECK-LABEL: name: test_memcpy_inline
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
@@ -146,8 +153,11 @@ body:             |
     ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0
     ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub1
     ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[COPY8]], killed [[COPY9]], 0, implicit $exec
+    ; CHECK: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
     ; CHECK: $vgpr0 = COPY [[V_ADD_U32_e64_]]
-    ; CHECK: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; CHECK: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
+    ; CHECK: S_SETPC_B64_return [[COPY11]], implicit $vgpr0
+    %4:sreg_64 = COPY $sgpr30_sgpr31
     %3:vgpr_32 = COPY $vgpr3
     %2:vgpr_32 = COPY $vgpr2
     %1:vgpr_32 = COPY $vgpr1
@@ -163,7 +173,9 @@ body:             |
     %13:vgpr_32 = COPY %11.sub0
     %14:vgpr_32 = COPY %11.sub1
     %15:vgpr_32 = V_ADD_U32_e64 killed %13, killed %14, 0, implicit $exec
+    %5:ccr_sgpr_64 = COPY %4
     $vgpr0 = COPY %15
-    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %16:ccr_sgpr_64 = COPY %5
+    S_SETPC_B64_return %16, implicit $vgpr0
 
 ...

diff --git a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
index 6e9f4dacc56c0..7b2ef70da2d66 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
@@ -17,9 +17,11 @@
 
 ---
 name:            foo
+liveins:
+  - { reg: '$sgpr30_sgpr31', virtual-reg: '' }
 body:             |
   bb.0:
-    SI_RETURN
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
 
 ...
 ---
@@ -29,6 +31,7 @@ liveins:
   - { reg: '$vgpr0', virtual-reg: '' }
   - { reg: '$vgpr1', virtual-reg: '' }
   - { reg: '$vgpr2', virtual-reg: '' }
+  - { reg: '$sgpr30_sgpr31', virtual-reg: '' }
 stack:
   - { id: 0, name: '', type: spill-slot, offset: 0, size: 8, alignment: 4,
       stack-id: sgpr-spill, callee-saved-register: '', callee-saved-restored: true,
@@ -38,7 +41,7 @@ machineFunctionInfo:
   stackPtrOffsetReg: '$sgpr32'
 body:             |
   bb.0:
-    liveins: $vgpr0, $vgpr1, $vgpr2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
 
     renamable $vgpr41 = COPY $vgpr2, implicit $exec
     renamable $vgpr40 = COPY $vgpr1, implicit $exec
@@ -49,10 +52,12 @@ body:             |
     ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
     renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @foo + 4, target-flags(amdgpu-gotprel32-hi) @foo + 12, implicit-def dead $scc
     renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0
+    SI_SPILL_S64_SAVE killed renamable $sgpr30_sgpr31, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
     dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @foo, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    renamable $sgpr30_sgpr31 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
     ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
 
-    SI_RETURN
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
 
...