[llvm] 76cbe62 - [AMDGPU] Changes the AMDGPU_Gfx calling convention by making the SGPRs 4..29 callee-save. This is to avoid superfluous s_movs when executing amdgpu_gfx function calls as the callee is likely not going to change the argument values.

Thomas Symalla via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 4 13:50:24 PDT 2021


Author: Thomas Symalla
Date: 2021-11-04T21:50:18+01:00
New Revision: 76cbe62262a3ccd0604599a1103ce5d38ac0164b

URL: https://github.com/llvm/llvm-project/commit/76cbe62262a3ccd0604599a1103ce5d38ac0164b
DIFF: https://github.com/llvm/llvm-project/commit/76cbe62262a3ccd0604599a1103ce5d38ac0164b.diff

LOG: [AMDGPU] Changes the AMDGPU_Gfx calling convention by making the SGPRs 4..29 callee-save. This is to avoid superfluous s_movs when executing amdgpu_gfx function calls as the callee is likely not going to change the argument values.

This patch changes the AMDGPU_Gfx calling convention. It defines the SGPR registers s[4:29] as callee-save and leaves some SGPRs usable for callers. The intention is to avoid unneccessary s_mov instructions for arguments the caller would otherwise save and restore in these registers.

Reviewed By: sebastian-ne

Differential Revision: https://reviews.llvm.org/D111637

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
    llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.td
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll
    llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
    llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
    llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
    llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
    llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 118a32580f66..43928d7c2a09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -355,14 +355,23 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
 
   auto const &ST = MF.getSubtarget<GCNSubtarget>();
 
-  unsigned ReturnOpc =
-      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
+  unsigned ReturnOpc = 0;
+  if (IsShader)
+    ReturnOpc = AMDGPU::SI_RETURN_TO_EPILOG;
+  else if (CC == CallingConv::AMDGPU_Gfx)
+    ReturnOpc = AMDGPU::S_SETPC_B64_return_gfx;
+  else
+    ReturnOpc = AMDGPU::S_SETPC_B64_return;
 
   auto Ret = B.buildInstrNoInsert(ReturnOpc);
   Register ReturnAddrVReg;
   if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
     ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
     Ret.addUse(ReturnAddrVReg);
+  } else if (ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
+    ReturnAddrVReg =
+        MRI.createVirtualRegister(&AMDGPU::Gfx_CCR_SGPR_64RegClass);
+    Ret.addUse(ReturnAddrVReg);
   }
 
   if (!FLI.CanLowerReturn)
@@ -370,7 +379,8 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
   else if (!lowerReturnVal(B, Val, VRegs, Ret))
     return false;
 
-  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
+  if (ReturnOpc == AMDGPU::S_SETPC_B64_return ||
+      ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
     const SIRegisterInfo *TRI = ST.getRegisterInfo();
     Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                          &AMDGPU::SGPR_64RegClass);

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 34fb7b840d31..1682d43ae671 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -20,11 +20,13 @@ def CC_SI_Gfx : CallingConv<[
   // 0-3 are reserved for the stack buffer descriptor
   // 30-31 are reserved for the return address
   // 32 is reserved for the stack pointer
+  // 33 is reserved for the frame pointer
+  // 34 is reserved for the base pointer
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
     SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
     SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
-    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29
   ]>>>,
 
   CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
@@ -41,17 +43,6 @@ def RetCC_SI_Gfx : CallingConv<[
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
-  // 0-3 are reserved for the stack buffer descriptor
-  // 32 is reserved for the stack pointer
-  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
-    SGPR4, SGPR5, SGPR6, SGPR7,
-    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
-    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
-    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
-    SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
-    SGPR40, SGPR41, SGPR42, SGPR43
-  ]>>>,
-
   CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
@@ -165,6 +156,14 @@ def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
   (sequence "SGPR%u", 32, 105)
 >;
 
+def CSR_AMDGPU_SI_Gfx_SGPRs_4_29 : CalleeSavedRegs<
+  (sequence "SGPR%u", 4, 29)
+>;
+
+def CSR_AMDGPU_SI_Gfx_SGPRs_64_105 : CalleeSavedRegs<
+  (sequence "SGPR%u", 64, 105)
+>;
+
 // Just to get the regmask, not for calling convention purposes.
 def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs<
   (sequence "VGPR%u", 0, 255)
@@ -190,6 +189,14 @@ def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs<
   (add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255)
 >;
 
+def CSR_AMDGPU_SI_Gfx : CalleeSavedRegs<
+  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs_4_29, CSR_AMDGPU_SI_Gfx_SGPRs_64_105)
+>;
+
+def CSR_AMDGPU_SI_Gfx_With_AGPRs : CalleeSavedRegs<
+  (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs_32_255)
+>;
+
 def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
 
 // Calling convention for leaf functions

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 16d9b302348c..b2b4882afe0e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4352,6 +4352,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(TC_RETURN)
   NODE_NAME_CASE(TRAP)
   NODE_NAME_CASE(RET_FLAG)
+  NODE_NAME_CASE(RET_GFX_FLAG)
   NODE_NAME_CASE(RETURN_TO_EPILOG)
   NODE_NAME_CASE(ENDPGM)
   NODE_NAME_CASE(DWORDADDR)

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 6bbcef2fe721..03632ac18598 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -343,7 +343,7 @@ namespace AMDGPUISD {
 enum NodeType : unsigned {
   // AMDIL ISD Opcodes
   FIRST_NUMBER = ISD::BUILTIN_OP_END,
-  UMUL,        // 32bit unsigned multiplication
+  UMUL, // 32bit unsigned multiplication
   BRANCH_COND,
   // End AMDIL ISD Opcodes
 
@@ -366,6 +366,9 @@ enum NodeType : unsigned {
   // Return with values from a non-entry function.
   RET_FLAG,
 
+  // Return with values from a non-entry function (AMDGPU_Gfx CC).
+  RET_GFX_FLAG,
+
   DWORDADDR,
   FRACT,
 
@@ -422,10 +425,10 @@ enum NodeType : unsigned {
   DOT4,
   CARRY,
   BORROW,
-  BFE_U32, // Extract range of bits with zero extension to 32-bits.
-  BFE_I32, // Extract range of bits with sign extension to 32-bits.
-  BFI, // (src0 & src1) | (~src0 & src2)
-  BFM, // Insert a range of bits into a 32-bit word.
+  BFE_U32,  // Extract range of bits with zero extension to 32-bits.
+  BFE_I32,  // Extract range of bits with sign extension to 32-bits.
+  BFI,      // (src0 & src1) | (~src0 & src2)
+  BFM,      // Insert a range of bits into a 32-bit word.
   FFBH_U32, // ctlz with -1 if input is zero.
   FFBH_I32,
   FFBL_B32, // cttz with -1 if input is zero.
@@ -534,7 +537,6 @@ enum NodeType : unsigned {
   LAST_AMDGPU_ISD_NUMBER
 };
 
-
 } // End namespace AMDGPUISD
 
 } // End namespace llvm

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 55e03ae35ce0..391dc8428539 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -359,6 +359,10 @@ def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPt
   [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
 >;
 
+def AMDGPUret_gfx_flag : SDNode<"AMDGPUISD::RET_GFX_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
 
 //===----------------------------------------------------------------------===//
 // Intrinsic/Custom node compatibility PatFrags

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 99e72815db2e..ef86d760184a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -120,7 +120,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
   // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
   // need to select it to the subtarget specific version, and there's no way to
   // do that with a single pseudo source operation.
-  if (Opcode == AMDGPU::S_SETPC_B64_return)
+  if (Opcode == AMDGPU::S_SETPC_B64_return ||
+      Opcode == AMDGPU::S_SETPC_B64_return_gfx)
     Opcode = AMDGPU::S_SETPC_B64;
   else if (Opcode == AMDGPU::SI_CALL) {
     // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a3be162432da..877b4500364e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2616,9 +2616,12 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     SDValue ReturnAddrReg = CreateLiveInRegister(
       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
 
-    SDValue ReturnAddrVirtualReg = DAG.getRegister(
-        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
-        MVT::i64);
+    SDValue ReturnAddrVirtualReg =
+        DAG.getRegister(MF.getRegInfo().createVirtualRegister(
+                            CallConv != CallingConv::AMDGPU_Gfx
+                                ? &AMDGPU::CCR_SGPR_64RegClass
+                                : &AMDGPU::Gfx_CCR_SGPR_64RegClass),
+                        MVT::i64);
     Chain =
         DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
     Flag = Chain.getValue(1);
@@ -2681,8 +2684,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     RetOps.push_back(Flag);
 
   unsigned Opc = AMDGPUISD::ENDPGM;
-  if (!IsWaveEnd)
-    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
+  if (!IsWaveEnd) {
+    if (IsShader)
+      Opc = AMDGPUISD::RETURN_TO_EPILOG;
+    else if (CallConv == CallingConv::AMDGPU_Gfx)
+      Opc = AMDGPUISD::RET_GFX_FLAG;
+    else
+      Opc = AMDGPUISD::RET_FLAG;
+  }
+
   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7d6f79922d2e..ea36ee46bea1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -963,6 +963,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
   //   with knowledge of the called routines.
   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+      MI.getOpcode() == AMDGPU::S_SETPC_B64_return_gfx ||
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 36fc20bbcba0..bfbe84f696f8 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -360,10 +360,13 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::Cold:
-  case CallingConv::AMDGPU_Gfx:
     return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
         ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
         : CSR_AMDGPU_HighRegs_SaveList;
+  case CallingConv::AMDGPU_Gfx:
+    return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
+               ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList
+               : CSR_AMDGPU_SI_Gfx_SaveList;
   default: {
     // Dummy to not crash RegisterClassInfo.
     static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
@@ -383,10 +386,13 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::Cold:
-  case CallingConv::AMDGPU_Gfx:
     return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
         ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
         : CSR_AMDGPU_HighRegs_RegMask;
+  case CallingConv::AMDGPU_Gfx:
+    return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
+               ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask
+               : CSR_AMDGPU_SI_Gfx_RegMask;
   default:
     return nullptr;
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 49dbb895ef2c..efae89969487 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -691,6 +691,14 @@ def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
   let AllocationPriority = SGPR_64.AllocationPriority;
 }
 
+// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC
+def Gfx_CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
+                                (add (trunc (shl SGPR_64, 15), 1), // s[30:31]
+                                     (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63]
+  let CopyCost = SGPR_64.CopyCost;
+  let AllocationPriority = SGPR_64.AllocationPriority;
+}
+
 def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
                             (add TTMP_64Regs)> {
   let isAllocatable = 0;

diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 2c2a31898ef8..61ecc13620a1 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -265,6 +265,7 @@ def S_SETPC_B64 : SOP1_1  <"s_setpc_b64">;
 let isReturn = 1 in {
 // Define variant marked as return rather than branch.
 def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>;
+def S_SETPC_B64_return_gfx : SOP1_1<"", Gfx_CCR_SGPR_64, [(AMDGPUret_gfx_flag i64:$src0)]>;
 }
 } // End isTerminator = 1, isBarrier = 1
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
index e097fcb90ffb..67cb7683fbf5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
@@ -18,10 +18,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   call amdgpu_gfx void @external_gfx_void_func_void()
   ret void
 }
@@ -39,10 +39,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
   ; CHECK-NEXT:   $vgpr0 = COPY [[C]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   call amdgpu_gfx void @external_gfx_void_func_i32(i32 42)
   ret void
 }
@@ -60,10 +60,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
   ; CHECK-NEXT:   $sgpr4 = COPY [[C]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
   ret void
 }
@@ -88,10 +88,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
   ; CHECK-NEXT:   $vgpr1 = COPY [[LOAD2]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
   call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val)
@@ -118,10 +118,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
   ; CHECK-NEXT:   $sgpr5 = COPY [[LOAD2]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
   call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index c70a348a0cb8..55449b9cca49 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -141,12 +141,12 @@ define amdgpu_gfx void @test_gfx_call_external_i32_func_i32_imm(i32 addrspace(1)
   ; GCN-NEXT:   $vgpr0 = COPY [[C]](s32)
   ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
+  ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
   ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[COPY4]](s32), [[MV]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
-  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
-  ; GCN-NEXT:   S_SETPC_B64_return [[COPY5]]
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY2]]
+  ; GCN-NEXT:   S_SETPC_B64_return_gfx [[COPY5]]
   %val = call amdgpu_gfx i32 @external_gfx_i32_func_i32(i32 42)
   store volatile i32 %val, i32 addrspace(1)* %out
   ret void
@@ -219,13 +219,13 @@ define amdgpu_gfx void @test_gfx_call_external_i1_func_void() #0 {
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i1_func_void
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
+  ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i1_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `i1 addrspace(1)* undef`, addrspace 1)
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; GCN-NEXT:   S_SETPC_B64_return [[COPY3]]
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   %val = call amdgpu_gfx i1 @external_gfx_i1_func_void()
   store volatile i1 %val, i1 addrspace(1)* undef
   ret void
@@ -415,14 +415,14 @@ define amdgpu_gfx void @test_gfx_call_external_i8_func_void() #0 {
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i8_func_void
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i8_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
+  ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i8_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
   ; GCN-NEXT:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16)
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `i8 addrspace(1)* undef`, addrspace 1)
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; GCN-NEXT:   S_SETPC_B64_return [[COPY3]]
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   %val = call amdgpu_gfx i8 @external_gfx_i8_func_void()
   store volatile i8 %val, i8 addrspace(1)* undef
   ret void
@@ -784,12 +784,12 @@ define amdgpu_gfx void @test_gfx_call_external_i32_func_void() #0 {
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i32_func_void
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
+  ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[COPY2]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; GCN-NEXT:   S_SETPC_B64_return [[COPY3]]
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   %val = call amdgpu_gfx i32 @external_gfx_i32_func_void()
   store volatile i32 %val, i32 addrspace(1)* undef
   ret void
@@ -2480,7 +2480,7 @@ define amdgpu_gfx void @test_gfx_call_external_i32_i64_func_void() #0 {
   ; GCN-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i32_i64_func_void
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; GCN-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
-  ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_i64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2
+  ; GCN-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_i64_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2
   ; GCN-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -2488,8 +2488,8 @@ define amdgpu_gfx void @test_gfx_call_external_i32_i64_func_void() #0 {
   ; GCN-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN-NEXT:   G_STORE [[COPY3]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
   ; GCN-NEXT:   G_STORE [[MV]](s64), [[COPY1]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
-  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; GCN-NEXT:   S_SETPC_B64_return [[COPY6]]
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   S_SETPC_B64_return_gfx [[COPY6]]
   %val = call amdgpu_gfx { i32, i64 } @external_gfx_i32_i64_func_void()
   %val.0 = extractvalue { i32, i64 } %val, 0
   %val.1 = extractvalue { i32, i64 } %val, 1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index bd068cd3b45e..0071ca9275e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -156,10 +156,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   call amdgpu_gfx void @external_gfx_void_func_void()
   ret void
 }
@@ -899,10 +899,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
   ; CHECK-NEXT:   $vgpr0 = COPY [[C]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   call amdgpu_gfx void @external_gfx_void_func_i32(i32 42)
   ret void
 }
@@ -920,10 +920,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
   ; CHECK-NEXT:   $sgpr4 = COPY [[C]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY3]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY3]]
   call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
   ret void
 }
@@ -3893,10 +3893,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
   ; CHECK-NEXT:   $vgpr1 = COPY [[LOAD2]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
   call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val)
@@ -3923,10 +3923,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
   ; CHECK-NEXT:   $sgpr5 = COPY [[LOAD2]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY2]]
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
   call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
index ee15b1a0249f..e635c5b3649a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
@@ -66,10 +66,10 @@ define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(void()* %fptr) {
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
-  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK-NEXT:   S_SETPC_B64_return_gfx [[COPY4]]
   call amdgpu_gfx void %fptr()
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index c86cdfce05df..accc9a102636 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -153,14 +153,14 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x24{{$}}
+; GCN-NEXT:        .sgpr_count:     0x28{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; SDAG-NEXT:        .vgpr_count:     0x2{{$}}
 ; GISEL-NEXT:        .vgpr_count:     0x3{{$}}
 ; GCN-NEXT:      dynamic_stack_loop:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; SDAG-NEXT:        .sgpr_count:     0x22{{$}}
-; GISEL-NEXT:        .sgpr_count:     0x24{{$}}
+; SDAG-NEXT:        .sgpr_count:     0x25{{$}}
+; GISEL-NEXT:        .sgpr_count:     0x26{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; SDAG-NEXT:        .vgpr_count:     0x3{{$}}
 ; GISEL-NEXT:        .vgpr_count:     0x4{{$}}
@@ -176,26 +176,26 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      no_stack_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x21{{$}}
+; GCN-NEXT:        .sgpr_count:     0x26{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2{{$}}
 ; GCN-NEXT:      no_stack_extern_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x24{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x28{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x29{{$}}
 ; GCN-NEXT:      no_stack_extern_call_many_args:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x24{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x28{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x90{{$}}
 ; SDAG-NEXT:        .vgpr_count:     0x2a{{$}}
 ; GISEL-NEXT:        .vgpr_count:     0x34{{$}}
 ; GCN-NEXT:      no_stack_indirect_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x24{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x28{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; SDAG-NEXT:        .vgpr_count:     0x2a{{$}}
 ; GISEL-NEXT:        .vgpr_count:     0x34{{$}}
@@ -206,7 +206,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      simple_lds_recurse:
 ; GCN-NEXT:        .lds_size:       0x100{{$}}
-; GCN-NEXT:        .sgpr_count:     0x24{{$}}
+; GCN-NEXT:        .sgpr_count:     0x26{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x29{{$}}
 ; GCN-NEXT:      simple_stack:
@@ -216,25 +216,25 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:        .vgpr_count:     0x2{{$}}
 ; GCN-NEXT:      simple_stack_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x22{{$}}
+; GCN-NEXT:        .sgpr_count:     0x26{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x3{{$}}
 ; GCN-NEXT:      simple_stack_extern_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x24{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x28{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2a{{$}}
 ; GCN-NEXT:      simple_stack_indirect_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x24{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x28{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
 ; SDAG-NEXT:        .vgpr_count:     0x2b{{$}}
 ; GISEL-NEXT:        .vgpr_count:     0x34{{$}}
 ; GCN-NEXT:      simple_stack_recurse:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x24{{$}}
+; GCN-NEXT:        .sgpr_count:     0x26{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2a{{$}}
 ; GCN-NEXT: ...

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 16022e33b84c..29d1518ba19b 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -96,59 +96,59 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i1_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i1 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i1 at rel32@hi+12
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i1_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i1 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i1 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -169,16 +169,16 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_i1(i1 true)
   ret void
 }
@@ -187,9 +187,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i1_signext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
@@ -197,53 +197,53 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i1_signext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_signext at rel32@hi+12
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i1_signext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i1_signext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_signext at rel32@hi+12
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_signext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -266,16 +266,16 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %var = load volatile i1, i1 addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_i1_signext(i1 signext%var)
   ret void
@@ -285,9 +285,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
@@ -295,53 +295,53 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i1_zeroext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_zeroext at rel32@hi+12
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i1_zeroext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i1_zeroext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_zeroext at rel32@hi+12
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_zeroext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -364,16 +364,16 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %var = load volatile i1, i1 addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_i1_zeroext(i1 zeroext %var)
   ret void
@@ -383,57 +383,57 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i8_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i8 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i8 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i8_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i8 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i8 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i8 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i8 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -453,16 +453,16 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_i8(i8 123)
   ret void
 }
@@ -471,9 +471,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i8_signext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_sbyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
@@ -481,49 +481,49 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i8_signext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_signext at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i8_signext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_sbyte v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i8_signext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_signext at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_signext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -544,16 +544,16 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %var = load volatile i8, i8 addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_i8_signext(i8 signext %var)
   ret void
@@ -563,9 +563,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i8_zeroext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
@@ -573,49 +573,49 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i8_zeroext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_zeroext at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i8_zeroext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i8_zeroext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_zeroext at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_zeroext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -636,16 +636,16 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %var = load volatile i8, i8 addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_i8_zeroext(i8 zeroext %var)
   ret void
@@ -655,57 +655,57 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i16_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i16 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i16_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i16 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i16 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i16 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -725,16 +725,16 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_i16(i16 123)
   ret void
 }
@@ -743,9 +743,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i16_signext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
@@ -753,49 +753,49 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i16_signext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_signext at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i16_signext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i16_signext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_signext at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_signext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -816,16 +816,16 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %var = load volatile i16, i16 addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_i16_signext(i16 signext %var)
   ret void
@@ -835,9 +835,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i16_zeroext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
@@ -845,49 +845,49 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i16_zeroext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_zeroext at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i16_zeroext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i16_zeroext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_zeroext at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_zeroext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -908,16 +908,16 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %var = load volatile i16, i16 addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_i16_zeroext(i16 zeroext %var)
   ret void
@@ -927,57 +927,57 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -997,16 +997,16 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_i32(i32 42)
   ret void
 }
@@ -1015,9 +1015,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i64_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -1025,49 +1025,49 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i64 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i64_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i64 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i64_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1088,16 +1088,16 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i64 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_i64(i64 123)
   ret void
 }
@@ -1106,9 +1106,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
@@ -1117,50 +1117,50 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1182,16 +1182,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i64>, <2 x i64> addrspace(1)* null
   call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> %val)
   ret void
@@ -1201,9 +1201,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i64_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -1213,28 +1213,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i64_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2
@@ -1243,21 +1243,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1280,16 +1280,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
   ret void
 }
@@ -1298,9 +1298,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
@@ -1311,28 +1311,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i64 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
@@ -1342,21 +1342,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i64 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i64:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1380,16 +1380,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i64 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %load = load <2 x i64>, <2 x i64> addrspace(1)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
 
@@ -1401,9 +1401,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
@@ -1416,28 +1416,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i64 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
@@ -1449,21 +1449,21 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i64 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i64:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1489,16 +1489,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i64 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %load = load <2 x i64>, <2 x i64> addrspace(1)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call amdgpu_gfx void @external_void_func_v4i64(<4 x i64> %val)
@@ -1509,57 +1509,57 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f16_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f16 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f16 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f16 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_f16_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x4400
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_f16 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_f16 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f16 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1579,16 +1579,16 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_f16(half 4.0)
   ret void
 }
@@ -1597,57 +1597,57 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 4.0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_f32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 4.0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_f32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_f32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1667,16 +1667,16 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_f32(float 4.0)
   ret void
 }
@@ -1685,9 +1685,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -1695,49 +1695,49 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1758,16 +1758,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
   ret void
 }
@@ -1776,9 +1776,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -1787,28 +1787,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
@@ -1816,21 +1816,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1852,16 +1852,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
   ret void
 }
@@ -1870,9 +1870,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v5f32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -1883,28 +1883,28 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, -1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0.5
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v5f32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v5f32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v5f32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
@@ -1914,21 +1914,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0.5
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v5f32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v5f32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5f32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1952,16 +1952,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v5f32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
   ret void
 }
@@ -1970,9 +1970,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f64_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -1980,49 +1980,49 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f64 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_f64_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f64 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f64_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2043,16 +2043,16 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f64 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_f64(double 4.0)
   ret void
 }
@@ -2061,9 +2061,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f64_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -2073,28 +2073,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f64 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f64_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
@@ -2103,21 +2103,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f64 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f64_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2140,16 +2140,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f64 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
   ret void
 }
@@ -2158,9 +2158,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f64_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -2172,28 +2172,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x40200000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f64 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f64_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
@@ -2205,20 +2205,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f64 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f64_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2243,16 +2243,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
   ret void
 }
@@ -2261,57 +2261,57 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i16 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i16 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i16:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2331,16 +2331,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i16>, <2 x i16> addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_v2i16(<2 x i16> %val)
   ret void
@@ -2350,57 +2350,57 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2420,16 +2420,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <3 x i16>, <3 x i16> addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> %val)
   ret void
@@ -2439,57 +2439,57 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2509,16 +2509,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <3 x half>, <3 x half> addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_v3f16(<3 x half> %val)
   ret void
@@ -2528,9 +2528,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i16_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -2538,49 +2538,49 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i16_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2601,16 +2601,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
   ret void
 }
@@ -2619,9 +2619,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f16_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -2629,49 +2629,49 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f16_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4400
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2692,16 +2692,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
   ret void
 }
@@ -2710,57 +2710,57 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2780,16 +2780,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x i16>, <4 x i16> addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> %val)
   ret void
@@ -2799,9 +2799,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i16_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -2809,49 +2809,49 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40003
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i16_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40003
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2872,16 +2872,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
   ret void
 }
@@ -2890,57 +2890,57 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f16 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f16 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f16:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -2960,16 +2960,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x half>, <2 x half> addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_v2f16(<2 x half> %val)
   ret void
@@ -2979,57 +2979,57 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3049,16 +3049,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> %val)
   ret void
@@ -3068,9 +3068,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -3078,49 +3078,49 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3141,16 +3141,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
   ret void
 }
@@ -3159,9 +3159,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -3170,28 +3170,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 4
@@ -3199,21 +3199,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3235,16 +3235,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
   ret void
 }
@@ -3253,9 +3253,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i32_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -3265,28 +3265,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i32_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 4
@@ -3295,21 +3295,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3332,16 +3332,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
   ret void
 }
@@ -3350,57 +3350,57 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3420,16 +3420,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
   call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> %val)
   ret void
@@ -3439,9 +3439,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -3451,28 +3451,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2
@@ -3481,21 +3481,21 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3518,16 +3518,16 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
   ret void
 }
@@ -3536,9 +3536,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v5i32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -3549,28 +3549,28 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v5i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v5i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v5i32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2
@@ -3580,21 +3580,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 5
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v5i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v5i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3618,16 +3618,16 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v5i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
   ret void
 }
@@ -3636,66 +3636,67 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[4:5]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[4:5] offset:16
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[30:31]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[30:31] offset:16
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v8i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[4:5]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[4:5] offset:16
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[30:31]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[30:31] offset:16
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3720,16 +3721,16 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
   call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> %val)
@@ -3740,9 +3741,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -3756,28 +3757,28 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 7
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 8
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v8i32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 2
@@ -3791,20 +3792,20 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3831,16 +3832,16 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
   ret void
 }
@@ -3849,70 +3850,71 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v16i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v16, 0
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[30:31]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[30:31] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v16, s[30:31] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v16, s[30:31] offset:48
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v16i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v16i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v16, 0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[30:31]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[30:31] offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v16, s[30:31] offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v16, s[30:31] offset:48
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v16i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3939,16 +3941,16 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
   call amdgpu_gfx void @external_void_func_v16i32(<16 x i32> %val)
@@ -3959,79 +3961,80 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v28, 0
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[4:5]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
-; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[30:31]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[30:31] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[30:31] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[30:31] offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[30:31] offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[30:31] offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[30:31] offset:96
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[30:31] offset:112
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v32i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v32, 0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
-; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
-; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
-; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x7
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[30:31]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v32, s[30:31] offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v32, s[30:31] offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v32, s[30:31] offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v32, s[30:31] offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[30:31] offset:80
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[30:31] offset:96
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[30:31] offset:112
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4062,16 +4065,16 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
   call amdgpu_gfx void @external_void_func_v32i32(<32 x i32> %val)
@@ -4082,86 +4085,87 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i32_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v28, 0
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[4:5]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
-; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[30:31]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[30:31] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[30:31] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[30:31] offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[30:31] offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[30:31] offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[30:31] offset:96
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32 at rel32@hi+12
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[30:31] offset:112
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32 at rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    global_load_dword v32, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v32i32_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v32, 0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    global_load_dword v33, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
-; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
-; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
-; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[30:31]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v32, s[30:31] offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v32, s[30:31] offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v32, s[30:31] offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v32, s[30:31] offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[30:31] offset:80
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[30:31] offset:96
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[30:31] offset:112
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32 at rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4195,16 +4199,16 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
   %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0
   %val1 = load i32, i32 addrspace(1)* undef
@@ -4216,9 +4220,9 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX9-LABEL: test_call_external_i32_func_i32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -4229,32 +4233,32 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v42, v1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_i32_func_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_i32_func_i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    global_store_dword v[41:42], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_i32_func_i32_imm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -4264,26 +4268,26 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_mov_b32_e32 v42, v1
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_i32_func_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_i32_func_i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    global_store_dword v[41:42], v0, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_i32_func_i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4312,16 +4316,16 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42)
   store volatile i32 %val, i32 addrspace(1)* %out
   ret void
@@ -4331,66 +4335,67 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:4
-; GFX9-NEXT:    global_load_ubyte v0, v2, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    global_load_dword v1, v2, s[30:31] offset:4
+; GFX9-NEXT:    global_load_ubyte v0, v2, s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_struct_i8_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_ubyte v0, v2, s[4:5]
-; GFX10-NEXT:    global_load_dword v1, v2, s[4:5] offset:4
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_ubyte v0, v2, s[30:31]
+; GFX10-NEXT:    global_load_dword v1, v2, s[30:31] offset:4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_struct_i8_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4415,16 +4420,16 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_struct_i8_i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
   call amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 } %val)
@@ -4435,9 +4440,9 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
@@ -4448,28 +4453,28 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_byval_struct_i8_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
@@ -4479,21 +4484,21 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_byval_struct_i8_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4517,16 +4522,16 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = alloca { i8, i32 }, align 4, addrspace(5)
   %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 0
   %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 1
@@ -4540,9 +4545,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
@@ -4555,14 +4560,14 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT:    v_add_u32_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4570,20 +4575,20 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
@@ -4594,17 +4599,17 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v1, 5, s33
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 8, v0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -4612,12 +4617,12 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4646,8 +4651,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    scratch_load_ubyte v0, off, s33 offset:8
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v1, off, s33 offset:12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
@@ -4655,12 +4660,12 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-SCRATCH-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 offset:16 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %in.val = alloca { i8, i32 }, align 4, addrspace(5)
   %out.val = alloca { i8, i32 }, align 4, addrspace(5)
   %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 0
@@ -4682,21 +4687,21 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v16i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v16i8 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i8 at rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -4716,38 +4721,38 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v16
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v18
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v16i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[4:5]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[30:31]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v16i8 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i8 at rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -4767,17 +4772,17 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v16
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v18
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i8:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4820,16 +4825,16 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef
   %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
   call amdgpu_gfx void @external_void_func_v16i8(<16 x i8> %val)
@@ -4840,44 +4845,269 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-LABEL: tail_call_byval_align16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg at rel32@hi+12
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 30
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:8
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:12
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s40, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s41, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s42, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s43, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s44, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s45, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s46, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s47, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 16
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 17
+; GFX9-NEXT:    v_writelane_b32 v40, s52, 18
+; GFX9-NEXT:    v_writelane_b32 v40, s53, 19
+; GFX9-NEXT:    v_writelane_b32 v40, s54, 20
+; GFX9-NEXT:    v_writelane_b32 v40, s55, 21
+; GFX9-NEXT:    v_writelane_b32 v40, s56, 22
+; GFX9-NEXT:    v_writelane_b32 v40, s57, 23
+; GFX9-NEXT:    v_writelane_b32 v40, s58, 24
+; GFX9-NEXT:    v_writelane_b32 v40, s59, 25
+; GFX9-NEXT:    v_writelane_b32 v40, s60, 26
+; GFX9-NEXT:    v_writelane_b32 v40, s61, 27
+; GFX9-NEXT:    s_addk_i32 s32, 0x800
+; GFX9-NEXT:    v_writelane_b32 v40, s62, 28
+; GFX9-NEXT:    v_writelane_b32 v40, s63, 29
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, byval_align16_f64_arg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, byval_align16_f64_arg at rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s63, v40, 29
+; GFX9-NEXT:    v_readlane_b32 s62, v40, 28
+; GFX9-NEXT:    v_readlane_b32 s61, v40, 27
+; GFX9-NEXT:    v_readlane_b32 s60, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s59, v40, 25
+; GFX9-NEXT:    v_readlane_b32 s58, v40, 24
+; GFX9-NEXT:    v_readlane_b32 s57, v40, 23
+; GFX9-NEXT:    v_readlane_b32 s56, v40, 22
+; GFX9-NEXT:    v_readlane_b32 s55, v40, 21
+; GFX9-NEXT:    v_readlane_b32 s54, v40, 20
+; GFX9-NEXT:    v_readlane_b32 s53, v40, 19
+; GFX9-NEXT:    v_readlane_b32 s52, v40, 18
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s47, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s46, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s45, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s44, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s43, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s42, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s41, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s40, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX9-NEXT:    s_addk_i32 s32, 0xf800
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 30
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GFX10-LABEL: tail_call_byval_align16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 30
+; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg at rel32@hi+12
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:12
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    s_addk_i32 s32, 0x400
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX10-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, byval_align16_f64_arg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, byval_align16_f64_arg at rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
+; GFX10-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s40, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s41, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s42, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s43, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s44, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s45, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s46, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s47, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s48, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s49, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s50, 16
+; GFX10-NEXT:    v_writelane_b32 v40, s51, 17
+; GFX10-NEXT:    v_writelane_b32 v40, s52, 18
+; GFX10-NEXT:    v_writelane_b32 v40, s53, 19
+; GFX10-NEXT:    v_writelane_b32 v40, s54, 20
+; GFX10-NEXT:    v_writelane_b32 v40, s55, 21
+; GFX10-NEXT:    v_writelane_b32 v40, s56, 22
+; GFX10-NEXT:    v_writelane_b32 v40, s57, 23
+; GFX10-NEXT:    v_writelane_b32 v40, s58, 24
+; GFX10-NEXT:    v_writelane_b32 v40, s59, 25
+; GFX10-NEXT:    v_writelane_b32 v40, s60, 26
+; GFX10-NEXT:    v_writelane_b32 v40, s61, 27
+; GFX10-NEXT:    v_writelane_b32 v40, s62, 28
+; GFX10-NEXT:    v_writelane_b32 v40, s63, 29
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s63, v40, 29
+; GFX10-NEXT:    v_readlane_b32 s62, v40, 28
+; GFX10-NEXT:    v_readlane_b32 s61, v40, 27
+; GFX10-NEXT:    v_readlane_b32 s60, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s59, v40, 25
+; GFX10-NEXT:    v_readlane_b32 s58, v40, 24
+; GFX10-NEXT:    v_readlane_b32 s57, v40, 23
+; GFX10-NEXT:    v_readlane_b32 s56, v40, 22
+; GFX10-NEXT:    v_readlane_b32 s55, v40, 21
+; GFX10-NEXT:    v_readlane_b32 s54, v40, 20
+; GFX10-NEXT:    v_readlane_b32 s53, v40, 19
+; GFX10-NEXT:    v_readlane_b32 s52, v40, 18
+; GFX10-NEXT:    v_readlane_b32 s51, v40, 17
+; GFX10-NEXT:    v_readlane_b32 s50, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s49, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s48, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s47, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s46, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s45, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s44, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s43, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s42, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s41, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s40, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 30
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GFX10-SCRATCH-LABEL: tail_call_byval_align16:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-SCRATCH-NEXT:    scratch_load_dwordx2 v[32:33], off, s32 offset:8
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 offset:16 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 30
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
+; GFX10-SCRATCH-NEXT:    scratch_load_dwordx2 v[32:33], off, s33 offset:8
+; GFX10-SCRATCH-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, byval_align16_f64_arg at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s40, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s41, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s42, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s43, 9
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s44, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s45, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s46, 12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s47, 13
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 14
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 15
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 17
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 18
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 19
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 20
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 21
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s56, 22
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s57, 23
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s58, 24
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s59, 25
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s60, 26
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s61, 27
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s62, 28
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s63, 29
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s63, v40, 29
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s62, v40, 28
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s61, v40, 27
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s60, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s59, v40, 25
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s58, v40, 24
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s57, v40, 23
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s56, v40, 22
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 21
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 20
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 19
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 18
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 17
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 16
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 15
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 14
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s47, v40, 13
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s46, v40, 12
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s45, v40, 11
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s44, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s43, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s42, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s41, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s40, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 30
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca double, align 8, addrspace(5)
   tail call amdgpu_gfx void @byval_align16_f64_arg(<32 x i32> %val, double addrspace(5)* byval(double) align 16 %alloca)
@@ -4889,59 +5119,59 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i1_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_inreg at rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i1_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_inreg at rel32@hi+12
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i1_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i1_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_inreg at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i1_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i1_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4962,16 +5192,16 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_i1_inreg(i1 inreg true)
   ret void
 }
@@ -4980,57 +5210,61 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i8_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i8_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i8_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i8_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i8_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_i8_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_i8_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i8_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i8_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5040,26 +5274,28 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i8_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i8_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_i8_inreg(i8 inreg 123)
   ret void
 }
@@ -5068,57 +5304,61 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i16_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i16_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_i16_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_i16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i16_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5128,26 +5368,28 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_i16_inreg(i16 inreg 123)
   ret void
 }
@@ -5156,57 +5398,61 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i32_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s4, 42
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i32_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_mov_b32 s4, 42
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 42
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5216,26 +5462,28 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 42
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 42
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_i32_inreg(i32 inreg 42)
   ret void
 }
@@ -5244,59 +5492,67 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i64_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX9-NEXT:    s_mov_b32 s5, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_i64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_i64_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_movk_i32 s4, 0x7b
-; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_i64_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_i64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_i64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_i64_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i64_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5306,27 +5562,31 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_i64_inreg(i64 inreg 123)
   ret void
 }
@@ -5335,59 +5595,75 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i64_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    s_mov_b64 s[30:31], 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i64_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[8:9]
-; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    s_mov_b64 s[30:31], 0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5397,27 +5673,35 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i64>, <2 x i64> addrspace(4)* null
   call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg %val)
   ret void
@@ -5427,63 +5711,79 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i64_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i64_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    s_mov_b32 s6, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX10-NEXT:    s_mov_b32 s7, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[8:9]
-; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i64_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5493,29 +5793,37 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg <i64 8589934593, i64 17179869187>)
   ret void
 }
@@ -5524,63 +5832,87 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i64_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
+; GFX9-NEXT:    s_mov_b64 s[30:31], 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s8, 1
 ; GFX9-NEXT:    s_mov_b32 s9, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[10:11]
-; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v3i64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v3i64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    s_getpc_b64 s[34:35]
+; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v3i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v3i64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 8
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i64_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    s_mov_b32 s8, 1
-; GFX10-NEXT:    s_mov_b32 s9, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 8
 ; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[10:11]
-; GFX10-NEXT:    s_add_u32 s10, s10, external_void_func_v3i64_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s11, s11, external_void_func_v3i64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[10:11]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-NEXT:    s_mov_b32 s8, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-NEXT:    s_mov_b32 s9, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
+; GFX10-NEXT:    s_mov_b64 s[30:31], 0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i64_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 8
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i64_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5590,29 +5922,41 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 8
 ; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 8
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %load = load <2 x i64>, <2 x i64> addrspace(4)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
 
@@ -5624,67 +5968,99 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i64_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX9-NEXT:    s_mov_b64 s[30:31], 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s8, 1
 ; GFX9-NEXT:    s_mov_b32 s9, 2
 ; GFX9-NEXT:    s_mov_b32 s10, 3
 ; GFX9-NEXT:    s_mov_b32 s11, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[12:13]
-; GFX9-NEXT:    s_add_u32 s12, s12, external_void_func_v4i64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s13, s13, external_void_func_v4i64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[12:13]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    s_getpc_b64 s[34:35]
+; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v4i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v4i64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i64_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 10
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-NEXT:    s_mov_b32 s8, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-NEXT:    s_mov_b32 s9, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX10-NEXT:    s_mov_b32 s10, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
 ; GFX10-NEXT:    s_mov_b32 s11, 4
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[12:13]
-; GFX10-NEXT:    s_add_u32 s12, s12, external_void_func_v4i64_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s13, s13, external_void_func_v4i64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[12:13]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX10-NEXT:    s_mov_b64 s[30:31], 0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i64_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i64_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5694,31 +6070,47 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 10
 ; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s10, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s11, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s10, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s11, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %load = load <2 x i64>, <2 x i64> addrspace(4)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> inreg %val)
@@ -5729,57 +6121,61 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f16_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x4400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_f16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_f16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_f16_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_movk_i32 s4, 0x4400
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_f16_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_f16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_movk_i32 s4, 0x4400
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f16_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5789,26 +6185,28 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x4400
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_f16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x4400
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_f16_inreg(half inreg 4.0)
   ret void
 }
@@ -5817,57 +6215,61 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f32_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s4, 4.0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_f32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_f32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_f32_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_mov_b32 s4, 4.0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_f32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_f32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 4.0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5877,26 +6279,28 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 4.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_f32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 4.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_f32_inreg(float inreg 4.0)
   ret void
 }
@@ -5905,59 +6309,67 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f32_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 1.0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2f32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2f32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f32_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_mov_b32 s4, 1.0
-; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v2f32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v2f32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 1.0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5967,27 +6379,31 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2f32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> inreg <float 1.0, float 2.0>)
   ret void
 }
@@ -5996,61 +6412,73 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f32_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX9-NEXT:    s_mov_b32 s4, 1.0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 4.0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v3f32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 5
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f32_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 5
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1.0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    s_mov_b32 s6, 4.0
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[8:9]
-; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v3f32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 5
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6060,28 +6488,34 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 4.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 5
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 4.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 5
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> inreg <float 1.0, float 2.0, float 4.0>)
   ret void
 }
@@ -6090,65 +6524,85 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v5f32_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX9-NEXT:    s_mov_b32 s4, 1.0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 4.0
 ; GFX9-NEXT:    s_mov_b32 s7, -1.0
 ; GFX9-NEXT:    s_mov_b32 s8, 0.5
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[10:11]
-; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v5f32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v5f32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v5f32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v5f32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 7
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v5f32_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 7
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1.0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    s_mov_b32 s6, 4.0
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX10-NEXT:    s_mov_b32 s7, -1.0
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-NEXT:    s_mov_b32 s8, 0.5
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[10:11]
-; GFX10-NEXT:    s_add_u32 s10, s10, external_void_func_v5f32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s11, s11, external_void_func_v5f32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[10:11]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v5f32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v5f32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 7
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5f32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6158,30 +6612,40 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 4.0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, -1.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 0.5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 7
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v5f32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v5f32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 4.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, -1.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 0.5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 7
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> inreg <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
   ret void
 }
@@ -6190,59 +6654,67 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f64_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    s_mov_b32 s5, 0x40100000
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_f64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_f64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_f64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_f64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_f64_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x40100000
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_f64_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_f64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, 0x40100000
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_f64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_f64_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f64_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6252,27 +6724,31 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40100000
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_f64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_f64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40100000
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_f64_inreg(double inreg 4.0)
   ret void
 }
@@ -6281,63 +6757,79 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f64_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v2f64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f64_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    s_mov_b32 s6, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX10-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[8:9]
-; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v2f64_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f64_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f64_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6347,29 +6839,37 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2f64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> inreg <double 2.0, double 4.0>)
   ret void
 }
@@ -6378,67 +6878,91 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f64_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 6
 ; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
 ; GFX9-NEXT:    s_mov_b32 s8, 0
 ; GFX9-NEXT:    s_mov_b32 s9, 0x40200000
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[10:11]
-; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v3f64_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v3f64_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f64_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 8
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f64_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 8
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    s_mov_b32 s6, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX10-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-NEXT:    s_mov_b32 s8, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-NEXT:    s_mov_b32 s9, 0x40200000
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[10:11]
-; GFX10-NEXT:    s_add_u32 s10, s10, external_void_func_v3f64_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s11, s11, external_void_func_v3f64_inreg at rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[10:11]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f64_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 8
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f64_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6448,31 +6972,43 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 8
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64_inreg at rel32@lo+4
+; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64_inreg at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 0x40200000
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
-; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64_inreg at rel32@lo+4
-; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64_inreg at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 8
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> inreg <double 2.0, double 4.0, double 8.0>)
   ret void
 }
@@ -6481,57 +7017,61 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i16_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_load_dword s4, s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2i16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i16_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v2i16_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    s_load_dword s4, s[30:31], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i16_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6541,26 +7081,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i16>, <2 x i16> addrspace(4)* undef
   call amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> inreg %val)
   ret void
@@ -6570,57 +7112,65 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i16_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3i16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i16_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v3i16_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6630,26 +7180,30 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <3 x i16>, <3 x i16> addrspace(4)* undef
   call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg %val)
   ret void
@@ -6659,57 +7213,65 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f16_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3f16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f16_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v3f16_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6719,26 +7281,30 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <3 x half>, <3 x half> addrspace(4)* undef
   call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg %val)
   ret void
@@ -6748,59 +7314,67 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i16_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX9-NEXT:    s_mov_b32 s5, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3i16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i16_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_mov_b32 s4, 0x20001
-; GFX10-NEXT:    s_mov_b32 s5, 3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v3i16_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0x20001
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i16_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6810,27 +7384,31 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg <i16 1, i16 2, i16 3>)
   ret void
 }
@@ -6839,59 +7417,67 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f16_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX9-NEXT:    s_movk_i32 s5, 0x4400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3f16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f16_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_mov_b32 s4, 0x40003c00
-; GFX10-NEXT:    s_movk_i32 s5, 0x4400
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v3f16_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0x40003c00
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_movk_i32 s5, 0x4400
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3f16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3f16_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6901,27 +7487,31 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x40003c00
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s5, 0x4400
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x40003c00
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s5, 0x4400
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg <half 1.0, half 2.0, half 4.0>)
   ret void
 }
@@ -6930,57 +7520,65 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i16_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v4i16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i16_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v4i16_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -6990,26 +7588,30 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x i16>, <4 x i16> addrspace(4)* undef
   call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg %val)
   ret void
@@ -7019,59 +7621,67 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i16_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX9-NEXT:    s_mov_b32 s5, 0x40003
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v4i16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i16_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_mov_b32 s4, 0x20001
-; GFX10-NEXT:    s_mov_b32 s5, 0x40003
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v4i16_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0x20001
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, 0x40003
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i16_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -7081,27 +7691,31 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40003
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40003
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg <i16 1, i16 2, i16 3, i16 4>)
   ret void
 }
@@ -7110,57 +7724,61 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f16_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_load_dword s4, s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2f16_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2f16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f16_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f16_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v2f16_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v2f16_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    s_load_dword s4, s[30:31], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2f16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2f16_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -7170,26 +7788,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2f16_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f16_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x half>, <2 x half> addrspace(4)* undef
   call amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> inreg %val)
   ret void
@@ -7199,57 +7819,65 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i32_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i32_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v2i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[30:31], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -7259,26 +7887,30 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i32>, <2 x i32> addrspace(4)* undef
   call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg %val)
   ret void
@@ -7288,59 +7920,67 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i32_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i32_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-NEXT:    s_mov_b32 s4, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v2i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v2i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v2i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -7350,27 +7990,31 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v2i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg <i32 1, i32 2>)
   ret void
 }
@@ -7379,61 +8023,73 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i32_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 3
 ; GFX9-NEXT:    s_mov_b32 s4, 3
 ; GFX9-NEXT:    s_mov_b32 s5, 4
 ; GFX9-NEXT:    s_mov_b32 s6, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 5
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i32_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 5
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    s_mov_b32 s6, 5
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[8:9]
-; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 5
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -7443,28 +8099,34 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 5
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 5
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> inreg <i32 3, i32 4, i32 5>)
   ret void
 }
@@ -7473,63 +8135,79 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i32_i32_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX9-NEXT:    s_mov_b32 s4, 3
 ; GFX9-NEXT:    s_mov_b32 s5, 4
 ; GFX9-NEXT:    s_mov_b32 s6, 5
 ; GFX9-NEXT:    s_mov_b32 s7, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i32_i32_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    s_mov_b32 s6, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX10-NEXT:    s_mov_b32 s7, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[8:9]
-; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v3i32_i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v3i32_i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -7539,29 +8217,37 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 5
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> inreg <i32 3, i32 4, i32 5>, i32 inreg 6)
   ret void
 }
@@ -7570,57 +8256,73 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i32_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i32_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[8:9]
-; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[30:31], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -7630,26 +8332,34 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x i32>, <4 x i32> addrspace(4)* undef
   call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg %val)
   ret void
@@ -7659,63 +8369,79 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i32_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i32_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 6
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    s_mov_b32 s6, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX10-NEXT:    s_mov_b32 s7, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[8:9]
-; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v4i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v4i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -7725,29 +8451,37 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 6
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v4i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg <i32 1, i32 2, i32 3, i32 4>)
   ret void
 }
@@ -7756,65 +8490,85 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v5i32_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 5
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
 ; GFX9-NEXT:    s_mov_b32 s8, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[10:11]
-; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v5i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v5i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v5i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v5i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 7
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v5i32_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 7
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    s_mov_b32 s6, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX10-NEXT:    s_mov_b32 s7, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-NEXT:    s_mov_b32 s8, 5
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[10:11]
-; GFX10-NEXT:    s_add_u32 s10, s10, external_void_func_v5i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s11, s11, external_void_func_v5i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[10:11]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v5i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v5i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 7
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -7824,30 +8578,40 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 7
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v5i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v5i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 7
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> inreg <i32 1, i32 2, i32 3, i32 4, i32 5>)
   ret void
 }
@@ -7856,61 +8620,93 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i32_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[34:35]
+; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v8i32_inreg at rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[12:13]
-; GFX9-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[12:13]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[30:31], 0x0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v8i32_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 10
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[12:13]
-; GFX10-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[12:13]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -7920,28 +8716,44 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 10
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load <8 x i32> addrspace(4)*, <8 x i32> addrspace(4)* addrspace(4)* undef
   %val = load <8 x i32>, <8 x i32> addrspace(4)* %ptr
   call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg %val)
@@ -7952,13 +8764,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i32_imm_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_mov_b32 s6, 3
@@ -7967,56 +8787,80 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s9, 6
 ; GFX9-NEXT:    s_mov_b32 s10, 7
 ; GFX9-NEXT:    s_mov_b32 s11, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[12:13]
-; GFX9-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[12:13]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v8i32_imm_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 10
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    s_mov_b32 s6, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX10-NEXT:    s_mov_b32 s7, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-NEXT:    s_mov_b32 s8, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-NEXT:    s_mov_b32 s9, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX10-NEXT:    s_mov_b32 s10, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
 ; GFX10-NEXT:    s_mov_b32 s11, 8
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[12:13]
-; GFX10-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[12:13]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -8026,33 +8870,49 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 10
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s10, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s11, 8
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
-; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32_inreg at rel32@lo+4
-; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
   ret void
 }
@@ -8061,61 +8921,125 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v16i32_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 18
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s12, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[34:35]
+; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_v16i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_v16i32_inreg at rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[20:21]
-; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v16i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v16i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s15, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s14, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s13, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s12, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 18
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v16i32_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 18
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[20:21]
-; GFX10-NEXT:    s_add_u32 s20, s20, external_void_func_v16i32_inreg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s21, s21, external_void_func_v16i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s12, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s14, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s15, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s16, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[20:21]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v16i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v16i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
+; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s15, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s14, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s13, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s12, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 18
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -8125,28 +9049,60 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 18
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s12, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s14, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s15, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s16, 12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_inreg at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 16
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s15, v40, 11
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s14, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s13, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s12, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 18
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load <16 x i32> addrspace(4)*, <16 x i32> addrspace(4)* addrspace(4)* undef
   %val = load <16 x i32>, <16 x i32> addrspace(4)* %ptr
   call amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> inreg %val)
@@ -8157,34 +9113,47 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i32_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 18
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s40, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s41, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s42, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s43, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s44, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s45, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s46, 10
-; GFX9-NEXT:    s_load_dwordx2 s[20:21], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s47, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 13
-; GFX9-NEXT:    v_writelane_b32 v40, s50, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s51, 15
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
-; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[20:21], 0x40
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 28
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s12, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
+; GFX9-NEXT:    v_writelane_b32 v40, s21, 17
+; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
+; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX9-NEXT:    v_writelane_b32 v40, s24, 20
+; GFX9-NEXT:    v_writelane_b32 v40, s25, 21
+; GFX9-NEXT:    v_writelane_b32 v40, s26, 22
+; GFX9-NEXT:    v_writelane_b32 v40, s27, 23
+; GFX9-NEXT:    v_writelane_b32 v40, s28, 24
+; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[30:31], 0x40
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_inreg at rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s46
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s47
@@ -8208,70 +9177,87 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s28, s44
 ; GFX9-NEXT:    s_mov_b32 s29, s45
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_getpc_b64 s[30:31]
-; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_inreg at rel32@hi+12
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 16
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 17
-; GFX9-NEXT:    v_readlane_b32 s51, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s50, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s47, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s46, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s45, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s44, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s43, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s42, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s41, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s40, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
+; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
+; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
+; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
+; GFX9-NEXT:    v_readlane_b32 s26, v40, 22
+; GFX9-NEXT:    v_readlane_b32 s25, v40, 21
+; GFX9-NEXT:    v_readlane_b32 s24, v40, 20
+; GFX9-NEXT:    v_readlane_b32 s23, v40, 19
+; GFX9-NEXT:    v_readlane_b32 s22, v40, 18
+; GFX9-NEXT:    v_readlane_b32 s21, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s20, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s15, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s14, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s13, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s12, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 18
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 28
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v32i32_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 18
-; GFX10-NEXT:    s_load_dwordx2 s[20:21], s[4:5], 0x0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 28
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s36, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s37, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s38, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s39, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s40, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s41, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s42, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s43, 7
-; GFX10-NEXT:    v_writelane_b32 v40, s44, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s45, 9
-; GFX10-NEXT:    v_writelane_b32 v40, s46, 10
-; GFX10-NEXT:    v_writelane_b32 v40, s47, 11
-; GFX10-NEXT:    v_writelane_b32 v40, s48, 12
-; GFX10-NEXT:    v_writelane_b32 v40, s49, 13
-; GFX10-NEXT:    v_writelane_b32 v40, s50, 14
-; GFX10-NEXT:    v_writelane_b32 v40, s51, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s12, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s14, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s15, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s16, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
+; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
+; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
+; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
+; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
+; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
+; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
+; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
+; GFX10-NEXT:    v_writelane_b32 v40, s29, 25
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[20:21], 0x40
-; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[30:31], 0x40
+; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
 ; GFX10-NEXT:    s_getpc_b64 s[30:31]
 ; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_inreg at rel32@hi+12
@@ -8299,32 +9285,42 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
 ; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 16
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 17
-; GFX10-NEXT:    v_readlane_b32 s51, v40, 15
-; GFX10-NEXT:    v_readlane_b32 s50, v40, 14
-; GFX10-NEXT:    v_readlane_b32 s49, v40, 13
-; GFX10-NEXT:    v_readlane_b32 s48, v40, 12
-; GFX10-NEXT:    v_readlane_b32 s47, v40, 11
-; GFX10-NEXT:    v_readlane_b32 s46, v40, 10
-; GFX10-NEXT:    v_readlane_b32 s45, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s44, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s43, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s42, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s41, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s40, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s39, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s38, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s37, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s36, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
+; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
+; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
+; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
+; GFX10-NEXT:    v_readlane_b32 s26, v40, 22
+; GFX10-NEXT:    v_readlane_b32 s25, v40, 21
+; GFX10-NEXT:    v_readlane_b32 s24, v40, 20
+; GFX10-NEXT:    v_readlane_b32 s23, v40, 19
+; GFX10-NEXT:    v_readlane_b32 s22, v40, 18
+; GFX10-NEXT:    v_readlane_b32 s21, v40, 17
+; GFX10-NEXT:    v_readlane_b32 s20, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s15, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s14, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s13, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s12, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 18
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 28
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -8334,26 +9330,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 18
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 28
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s40, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s41, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s42, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s43, 7
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s44, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s45, 9
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s46, 10
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s47, 11
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 13
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 14
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 15
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s12, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s14, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s15, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s16, 12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x40
@@ -8361,54 +9357,74 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s20, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s21, 17
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s46
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s47
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s48
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s24, 20
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, s49
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s20, s36
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s21, s37
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s22, s38
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s27, 23
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s27, s43
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s28, 24
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s28, s44
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s29, 25
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s29, s45
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 17
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 15
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 14
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 13
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s47, v40, 11
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s46, v40, 10
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s45, v40, 9
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s44, v40, 8
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s43, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s42, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s41, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s40, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s26, v40, 22
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s25, v40, 21
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s24, v40, 20
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s23, v40, 19
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s22, v40, 18
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s21, v40, 17
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s20, v40, 16
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s15, v40, 11
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s14, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s13, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s12, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 18
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 28
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef
   %val = load <32 x i32>, <32 x i32> addrspace(4)* %ptr
   call amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> inreg %val)
@@ -8419,36 +9435,53 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i32_i32_inreg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 18
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s40, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s41, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s42, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s43, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s44, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s45, 9
-; GFX9-NEXT:    v_writelane_b32 v40, s46, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s47, 11
-; GFX9-NEXT:    s_load_dwordx2 s[20:21], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s22, s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s48, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s49, 13
-; GFX9-NEXT:    v_writelane_b32 v40, s50, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s51, 15
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
-; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[20:21], 0x40
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 28
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s12, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
+; GFX9-NEXT:    v_writelane_b32 v40, s21, 17
+; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
+; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX9-NEXT:    v_writelane_b32 v40, s24, 20
+; GFX9-NEXT:    v_writelane_b32 v40, s25, 21
+; GFX9-NEXT:    v_writelane_b32 v40, s26, 22
+; GFX9-NEXT:    v_writelane_b32 v40, s27, 23
+; GFX9-NEXT:    v_writelane_b32 v40, s28, 24
+; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_mov_b32_e32 v0, s22
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s34, s[30:31], 0x0
+; GFX9-NEXT:    ; kill: killed $sgpr30_sgpr31
+; GFX9-NEXT:    ; kill: killed $sgpr30_sgpr31
+; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[30:31], 0x40
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s34
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg at rel32@hi+12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s46
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s47
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
@@ -8458,7 +9491,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s49
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s50
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s51
 ; GFX9-NEXT:    s_mov_b32 s20, s36
@@ -8471,83 +9503,100 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s27, s43
 ; GFX9-NEXT:    s_mov_b32 s28, s44
 ; GFX9-NEXT:    s_mov_b32 s29, s45
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_getpc_b64 s[30:31]
-; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg at rel32@hi+12
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 16
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 17
-; GFX9-NEXT:    v_readlane_b32 s51, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s50, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s49, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s48, v40, 12
-; GFX9-NEXT:    v_readlane_b32 s47, v40, 11
-; GFX9-NEXT:    v_readlane_b32 s46, v40, 10
-; GFX9-NEXT:    v_readlane_b32 s45, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s44, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s43, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s42, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s41, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s40, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
+; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
+; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
+; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
+; GFX9-NEXT:    v_readlane_b32 s26, v40, 22
+; GFX9-NEXT:    v_readlane_b32 s25, v40, 21
+; GFX9-NEXT:    v_readlane_b32 s24, v40, 20
+; GFX9-NEXT:    v_readlane_b32 s23, v40, 19
+; GFX9-NEXT:    v_readlane_b32 s22, v40, 18
+; GFX9-NEXT:    v_readlane_b32 s21, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s20, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s15, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s14, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s13, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s12, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 18
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 28
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_v32i32_i32_inreg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 18
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx2 s[20:21], s[4:5], 0x0
-; GFX10-NEXT:    s_load_dword s22, s[4:5], 0x0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 28
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s36, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s37, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s38, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s39, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s12, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s14, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s15, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s16, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
+; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
+; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
+; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
+; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
+; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
+; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
+; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
+; GFX10-NEXT:    v_writelane_b32 v40, s29, 25
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s22
-; GFX10-NEXT:    v_writelane_b32 v40, s40, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s41, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s42, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s43, 7
-; GFX10-NEXT:    v_writelane_b32 v40, s44, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s45, 9
-; GFX10-NEXT:    v_writelane_b32 v40, s46, 10
-; GFX10-NEXT:    v_writelane_b32 v40, s47, 11
-; GFX10-NEXT:    v_writelane_b32 v40, s48, 12
-; GFX10-NEXT:    v_writelane_b32 v40, s49, 13
-; GFX10-NEXT:    v_writelane_b32 v40, s50, 14
-; GFX10-NEXT:    v_writelane_b32 v40, s51, 15
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[20:21], 0x40
-; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_load_dword s34, s[30:31], 0x0
+; GFX10-NEXT:    ; meta instruction
+; GFX10-NEXT:    ; meta instruction
+; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[30:31], 0x40
+; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
 ; GFX10-NEXT:    s_getpc_b64 s[30:31]
 ; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg at rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s46
+; GFX10-NEXT:    v_mov_b32_e32 v0, s34
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s47
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s48
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    v_mov_b32_e32 v0, s46
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s49
 ; GFX10-NEXT:    s_mov_b32 s20, s36
 ; GFX10-NEXT:    s_mov_b32 s21, s37
@@ -8568,32 +9617,42 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
 ; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 16
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 17
-; GFX10-NEXT:    v_readlane_b32 s51, v40, 15
-; GFX10-NEXT:    v_readlane_b32 s50, v40, 14
-; GFX10-NEXT:    v_readlane_b32 s49, v40, 13
-; GFX10-NEXT:    v_readlane_b32 s48, v40, 12
-; GFX10-NEXT:    v_readlane_b32 s47, v40, 11
-; GFX10-NEXT:    v_readlane_b32 s46, v40, 10
-; GFX10-NEXT:    v_readlane_b32 s45, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s44, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s43, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s42, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s41, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s40, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s39, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s38, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s37, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s36, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
+; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
+; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
+; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
+; GFX10-NEXT:    v_readlane_b32 s26, v40, 22
+; GFX10-NEXT:    v_readlane_b32 s25, v40, 21
+; GFX10-NEXT:    v_readlane_b32 s24, v40, 20
+; GFX10-NEXT:    v_readlane_b32 s23, v40, 19
+; GFX10-NEXT:    v_readlane_b32 s22, v40, 18
+; GFX10-NEXT:    v_readlane_b32 s21, v40, 17
+; GFX10-NEXT:    v_readlane_b32 s20, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s15, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s14, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s13, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s12, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 18
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 28
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -8603,26 +9662,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 18
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 28
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s40, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s41, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s42, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s43, 7
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s44, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s45, 9
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s46, 10
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s47, 11
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 13
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 14
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 15
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s12, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s14, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s15, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s16, 12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x2
 ; GFX10-SCRATCH-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -8633,56 +9692,76 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s20, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s21, 17
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s46
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s47
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s48
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s24, 20
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, s49
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s20, s36
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s21, s37
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s22, s38
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v6, s32 offset:24
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s27, 23
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s27, s43
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s28, 24
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s28, s44
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s29, 25
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s29, s45
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v6, s32 offset:24
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 17
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 15
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 14
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 13
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s47, v40, 11
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s46, v40, 10
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s45, v40, 9
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s44, v40, 8
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s43, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s42, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s41, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s40, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s26, v40, 22
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s25, v40, 21
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s24, v40, 20
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s23, v40, 19
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s22, v40, 18
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s21, v40, 17
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s20, v40, 16
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s15, v40, 11
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s14, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s13, v40, 9
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s12, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 18
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 28
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %ptr0 = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef
   %val0 = load <32 x i32>, <32 x i32> addrspace(4)* %ptr0
   %val1 = load i32, i32 addrspace(4)* undef
@@ -8694,9 +9773,9 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33
@@ -8704,32 +9783,32 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, stack_passed_f64_arg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, stack_passed_f64_arg at rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_clause 0x1
@@ -8737,25 +9816,25 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, stack_passed_f64_arg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, stack_passed_f64_arg at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
@@ -8777,16 +9856,16 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 entry:
   call amdgpu_gfx void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
   ret void
@@ -8796,9 +9875,9 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-LABEL: stack_12xv3i32:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -8844,28 +9923,28 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 11
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_12xv3i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_12xv3i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: stack_12xv3i32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 12
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 13
@@ -8911,20 +9990,20 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 11
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_12xv3i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_12xv3i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: stack_12xv3i32:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
@@ -8980,16 +10059,16 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 entry:
   call amdgpu_gfx void @external_void_func_12xv3i32(
       <3 x i32><i32 0, i32 0, i32 0>,
@@ -9011,9 +10090,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-LABEL: stack_8xv5i32:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -9067,28 +10146,28 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 7
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_8xv5i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_8xv5i32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: stack_8xv5i32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 10
@@ -9142,20 +10221,20 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 7
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_8xv5i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_8xv5i32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: stack_8xv5i32:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
@@ -9216,16 +10295,16 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5i32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 entry:
   call amdgpu_gfx void @external_void_func_8xv5i32(
       <5 x i32><i32 0, i32 0, i32 0, i32 0, i32 0>,
@@ -9243,9 +10322,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-LABEL: stack_8xv5f32:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -9299,28 +10378,28 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 0x40e00000
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_8xv5f32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_8xv5f32 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: stack_8xv5f32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41000000
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41100000
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0x41200000
@@ -9374,20 +10453,20 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 0x40e00000
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_8xv5f32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_8xv5f32 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: stack_8xv5f32:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
@@ -9448,16 +10527,16 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5f32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5f32 at rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[0:1]
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 entry:
   call amdgpu_gfx void @external_void_func_8xv5f32(
       <5 x float><float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>,

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 2f5f1485c98c..fa8f51cf412f 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -8,69 +8,69 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX9-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s35, 1
-; GFX10-NEXT:    s_getpc_b64 s[34:35]
-; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s35, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @external_void_func_void()
   call void asm sideeffect "", ""() #0
   call amdgpu_gfx void @external_void_func_void()
@@ -81,21 +81,21 @@ define amdgpu_gfx void @void_func_void_clobber_s30_s31() #1 {
 ; GFX9-LABEL: void_func_void_clobber_s30_s31:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; clobber
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: void_func_void_clobber_s30_s31:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; clobber
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
   call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
   ret void
 }
@@ -104,75 +104,75 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
 ; GFX9-LABEL: test_call_void_func_void_mayclobber_s31:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s31
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s34, s31
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
-; GFX9-NEXT:    s_mov_b32 s31, s34
+; GFX9-NEXT:    s_mov_b32 s4, s31
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    s_mov_b32 s31, s4
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s31
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_void_func_void_mayclobber_s31:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX10-NEXT:    s_getpc_b64 s[34:35]
+; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s31
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    s_mov_b32 s34, s31
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 1
-; GFX10-NEXT:    s_mov_b32 s31, s34
+; GFX10-NEXT:    s_mov_b32 s4, s31
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_mov_b32 s31, s4
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s31
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %s31 = call i32 asm sideeffect "; def $0", "={s31}"()
   call amdgpu_gfx void @external_void_func_void()
   call void asm sideeffect "; use $0", "{s31}"(i32 %s31)
@@ -183,9 +183,9 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
 ; GFX9-LABEL: test_call_void_func_void_mayclobber_v31:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -196,33 +196,33 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
 ; GFX9-NEXT:    ; def v31
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v31
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v41
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v31
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_void_func_void_mayclobber_v31:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
@@ -232,26 +232,26 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, v31
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    v_mov_b32_e32 v31, v41
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use v31
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %v31 = call i32 asm sideeffect "; def $0", "={v31}"()
   call amdgpu_gfx void @external_void_func_void()
   call void asm sideeffect "; use $0", "{v31}"(i32 %v31)
@@ -263,67 +263,75 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
 ; GFX9-LABEL: test_call_void_func_void_preserves_s33:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s33
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s33
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_void_func_void_preserves_s33:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s33
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, s33
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    s_mov_b32 s33, s4
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s33
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %s33 = call i32 asm sideeffect "; def $0", "={s33}"()
   call amdgpu_gfx void @external_void_func_void()
   call void asm sideeffect "; use $0", "{s33}"(i32 %s33)
@@ -334,11 +342,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
 ; GFX9-LABEL: test_call_void_func_void_preserves_s34:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
@@ -346,59 +354,63 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s34
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX9-NEXT:    s_mov_b32 s4, s34
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    s_mov_b32 s34, s4
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s34
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_void_func_void_preserves_s34:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s34
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    s_mov_b32 s34, s4
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s34
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %s34 = call i32 asm sideeffect "; def $0", "={s34}"()
   call amdgpu_gfx void @external_void_func_void()
   call void asm sideeffect "; use $0", "{s34}"(i32 %s34)
@@ -409,9 +421,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
 ; GFX9-LABEL: test_call_void_func_void_preserves_v40:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v41, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -421,32 +433,32 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def v40
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v40
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v41, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v41, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_void_func_void_preserves_v40:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v41, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
@@ -455,25 +467,25 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
 ; GFX10-NEXT:    ; def v40
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_writelane_b32 v41, s30, 0
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v41, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use v40
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s4, v41, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v41, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v41, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %v40 = call i32 asm sideeffect "; def $0", "={v40}"()
   call amdgpu_gfx void @external_void_func_void()
   call void asm sideeffect "; use $0", "{v40}"(i32 %v40)
@@ -568,55 +580,55 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-LABEL: test_call_void_func_void_clobber_s33:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, void_func_void_clobber_s33 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_void_clobber_s33 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, void_func_void_clobber_s33 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, void_func_void_clobber_s33 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_void_func_void_clobber_s33:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, void_func_void_clobber_s33 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, void_func_void_clobber_s33 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, void_func_void_clobber_s33 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, void_func_void_clobber_s33 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @void_func_void_clobber_s33()
   ret void
 }
@@ -625,55 +637,55 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-LABEL: test_call_void_func_void_clobber_s34:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, void_func_void_clobber_s34 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_void_clobber_s34 at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, void_func_void_clobber_s34 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, void_func_void_clobber_s34 at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_call_void_func_void_clobber_s34:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, void_func_void_clobber_s34 at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, void_func_void_clobber_s34 at rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, void_func_void_clobber_s34 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, void_func_void_clobber_s34 at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void @void_func_void_clobber_s34()
   ret void
 }
@@ -682,11 +694,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-LABEL: callee_saved_sgpr_kernel:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
@@ -694,59 +706,61 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s40
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX9-NEXT:    s_mov_b32 s4, s40
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s40
+; GFX9-NEXT:    ; use s4
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s40, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: callee_saved_sgpr_kernel:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s40, 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s40
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, s40
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    ;;#ASMSTART
-; GFX10-NEXT:    ; use s40
+; GFX10-NEXT:    ; use s4
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s40, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
   call amdgpu_gfx void @external_void_func_void()
   call void asm sideeffect "; use $0", "s"(i32 %s40) #0
@@ -757,11 +771,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-LABEL: callee_saved_sgpr_vgpr_kernel:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
@@ -770,76 +784,78 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s40
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_mov_b32 s4, s40
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def v32
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v32
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s40
+; GFX9-NEXT:    ; use s4
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v41
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s40, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: callee_saved_sgpr_vgpr_kernel:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s40, 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s40
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, s40
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def v32
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_mov_b32_e32 v41, v32
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    ;;#ASMSTART
-; GFX10-NEXT:    ; use s40
+; GFX10-NEXT:    ; use s4
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use v41
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s40, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
-; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
   %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
   call amdgpu_gfx void @external_void_func_void()

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index d5c769a9f12e..7eca95becc00 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -23,27 +23,27 @@ define amdgpu_gfx void @call_i1() #0 {
 ; GFX9-LABEL: call_i1:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, return_i1 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, return_i1 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, return_i1 at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, return_i1 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: call_i1:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, return_i1 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, return_i1 at gotpcrel32@hi+12
-; GFX10-NEXT:    s_mov_b64 s[4:5], s[30:31]
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, return_i1 at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, return_i1 at gotpcrel32@hi+12
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
 entry:
   call amdgpu_gfx i1 @return_i1()
   ret void
@@ -70,27 +70,27 @@ define amdgpu_gfx void @call_i16() #0 {
 ; GFX9-LABEL: call_i16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, return_i16 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, return_i16 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, return_i16 at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, return_i16 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: call_i16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, return_i16 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, return_i16 at gotpcrel32@hi+12
-; GFX10-NEXT:    s_mov_b64 s[4:5], s[30:31]
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, return_i16 at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, return_i16 at gotpcrel32@hi+12
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
 entry:
   call amdgpu_gfx i16 @return_i16()
   ret void
@@ -117,27 +117,27 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX9-LABEL: call_2xi16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, return_2xi16 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, return_2xi16 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, return_2xi16 at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, return_2xi16 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: call_2xi16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, return_2xi16 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, return_2xi16 at gotpcrel32@hi+12
-; GFX10-NEXT:    s_mov_b64 s[4:5], s[30:31]
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, return_2xi16 at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, return_2xi16 at gotpcrel32@hi+12
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
 entry:
   call amdgpu_gfx <2 x i16> @return_2xi16()
   ret void
@@ -166,27 +166,27 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX9-LABEL: call_3xi16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, return_3xi16 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, return_3xi16 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, return_3xi16 at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, return_3xi16 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: call_3xi16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, return_3xi16 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, return_3xi16 at gotpcrel32@hi+12
-; GFX10-NEXT:    s_mov_b64 s[4:5], s[30:31]
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, return_3xi16 at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, return_3xi16 at gotpcrel32@hi+12
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
 entry:
   call amdgpu_gfx <3 x i16> @return_3xi16()
   ret void
@@ -1241,41 +1241,41 @@ define amdgpu_gfx void @call_512xi32() #0 {
 ; GFX9-LABEL: call_512xi32:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s33
+; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_add_i32 s33, s32, 0x1ffc0
 ; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffe0000
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0x60000
-; GFX9-NEXT:    s_getpc_b64 s[6:7]
-; GFX9-NEXT:    s_add_u32 s6, s6, return_512xi32 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s7, s7, return_512xi32 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, return_512xi32 at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, return_512xi32 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[30:31]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0xfffa0000
-; GFX9-NEXT:    s_mov_b32 s33, s8
-; GFX9-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_setpc_b64 s[36:37]
 ;
 ; GFX10-LABEL: call_512xi32:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_mov_b32 s8, s33
+; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_add_i32 s33, s32, 0xffe0
-; GFX10-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX10-NEXT:    s_mov_b64 s[36:37], s[30:31]
 ; GFX10-NEXT:    s_and_b32 s33, s33, 0xffff0000
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0x30000
-; GFX10-NEXT:    s_getpc_b64 s[6:7]
-; GFX10-NEXT:    s_add_u32 s6, s6, return_512xi32 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s7, s7, return_512xi32 at gotpcrel32@hi+12
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, return_512xi32 at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, return_512xi32 at gotpcrel32@hi+12
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0xfffd0000
-; GFX10-NEXT:    s_mov_b32 s33, s8
-; GFX10-NEXT:    s_setpc_b64 s[4:5]
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_setpc_b64 s[36:37]
 entry:
   call amdgpu_gfx <512 x i32> @return_512xi32()
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 774949915349..43a306d9c594 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -1074,41 +1074,90 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_writelane_b32 v40, s33, 6
+; GCN-NEXT:    v_writelane_b32 v40, s33, 30
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    v_writelane_b32 v40, s36, 2
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 3
-; GCN-NEXT:    v_writelane_b32 v40, s30, 4
-; GCN-NEXT:    v_writelane_b32 v40, s31, 5
-; GCN-NEXT:    s_mov_b64 s[34:35], exec
-; GCN-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s6, v0
-; GCN-NEXT:    v_readfirstlane_b32 s7, v1
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
-; GCN-NEXT:    s_and_saveexec_b64 s[36:37], vcc
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s45, 11
+; GCN-NEXT:    v_writelane_b32 v40, s46, 12
+; GCN-NEXT:    v_writelane_b32 v40, s47, 13
+; GCN-NEXT:    v_writelane_b32 v40, s48, 14
+; GCN-NEXT:    v_writelane_b32 v40, s49, 15
+; GCN-NEXT:    v_writelane_b32 v40, s50, 16
+; GCN-NEXT:    v_writelane_b32 v40, s51, 17
+; GCN-NEXT:    v_writelane_b32 v40, s52, 18
+; GCN-NEXT:    v_writelane_b32 v40, s53, 19
+; GCN-NEXT:    v_writelane_b32 v40, s54, 20
+; GCN-NEXT:    v_writelane_b32 v40, s55, 21
+; GCN-NEXT:    v_writelane_b32 v40, s56, 22
+; GCN-NEXT:    v_writelane_b32 v40, s57, 23
+; GCN-NEXT:    v_writelane_b32 v40, s58, 24
+; GCN-NEXT:    v_writelane_b32 v40, s59, 25
+; GCN-NEXT:    v_writelane_b32 v40, s60, 26
+; GCN-NEXT:    v_writelane_b32 v40, s61, 27
+; GCN-NEXT:    v_writelane_b32 v40, s62, 28
+; GCN-NEXT:    v_writelane_b32 v40, s63, 29
+; GCN-NEXT:    s_mov_b64 s[6:7], s[30:31]
+; GCN-NEXT:    s_mov_b64 s[8:9], exec
 ; GCN-NEXT:    s_movk_i32 s4, 0x7b
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[12:13], v[0:1]
+; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
 ; GCN-NEXT:    s_cbranch_execnz BB6_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    v_readlane_b32 s4, v40, 4
-; GCN-NEXT:    v_readlane_b32 s5, v40, 5
+; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_readlane_b32 s63, v40, 29
+; GCN-NEXT:    v_readlane_b32 s62, v40, 28
+; GCN-NEXT:    v_readlane_b32 s61, v40, 27
+; GCN-NEXT:    v_readlane_b32 s60, v40, 26
+; GCN-NEXT:    v_readlane_b32 s59, v40, 25
+; GCN-NEXT:    v_readlane_b32 s58, v40, 24
+; GCN-NEXT:    v_readlane_b32 s57, v40, 23
+; GCN-NEXT:    v_readlane_b32 s56, v40, 22
+; GCN-NEXT:    v_readlane_b32 s55, v40, 21
+; GCN-NEXT:    v_readlane_b32 s54, v40, 20
+; GCN-NEXT:    v_readlane_b32 s53, v40, 19
+; GCN-NEXT:    v_readlane_b32 s52, v40, 18
+; GCN-NEXT:    v_readlane_b32 s51, v40, 17
+; GCN-NEXT:    v_readlane_b32 s50, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 15
+; GCN-NEXT:    v_readlane_b32 s48, v40, 14
+; GCN-NEXT:    v_readlane_b32 s47, v40, 13
+; GCN-NEXT:    v_readlane_b32 s46, v40, 12
+; GCN-NEXT:    v_readlane_b32 s45, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 3
 ; GCN-NEXT:    v_readlane_b32 s36, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    v_readlane_b32 s33, v40, 6
-; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    v_readlane_b32 s33, v40, 30
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[4:5]
+; GCN-NEXT:    s_setpc_b64 s[6:7]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
 ; GISEL:       ; %bb.0:
@@ -1116,41 +1165,90 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_writelane_b32 v40, s33, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s33, 30
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
 ; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
 ; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 5
-; GISEL-NEXT:    s_mov_b64 s[34:35], exec
-; GISEL-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT:    v_readfirstlane_b32 s6, v0
-; GISEL-NEXT:    v_readfirstlane_b32 s7, v1
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
-; GISEL-NEXT:    s_and_saveexec_b64 s[36:37], vcc
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s45, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 20
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 21
+; GISEL-NEXT:    v_writelane_b32 v40, s56, 22
+; GISEL-NEXT:    v_writelane_b32 v40, s57, 23
+; GISEL-NEXT:    v_writelane_b32 v40, s58, 24
+; GISEL-NEXT:    v_writelane_b32 v40, s59, 25
+; GISEL-NEXT:    v_writelane_b32 v40, s60, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s61, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s62, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s63, 29
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[30:31]
 ; GISEL-NEXT:    s_movk_i32 s4, 0x7b
-; GISEL-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GISEL-NEXT:    s_mov_b64 s[8:9], exec
+; GISEL-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT:    v_readfirstlane_b32 s10, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s11, v1
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[12:13]
 ; GISEL-NEXT:    s_cbranch_execnz BB6_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[34:35]
-; GISEL-NEXT:    v_readlane_b32 s4, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s5, v40, 5
+; GISEL-NEXT:    s_mov_b64 exec, s[8:9]
+; GISEL-NEXT:    v_readlane_b32 s63, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s62, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s61, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s60, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s59, v40, 25
+; GISEL-NEXT:    v_readlane_b32 s58, v40, 24
+; GISEL-NEXT:    v_readlane_b32 s57, v40, 23
+; GISEL-NEXT:    v_readlane_b32 s56, v40, 22
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 21
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 20
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s45, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
 ; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    v_readlane_b32 s33, v40, 6
-; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GISEL-NEXT:    v_readlane_b32 s33, v40, 30
+; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_setpc_b64 s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[6:7]
   call amdgpu_gfx void %fptr(i32 inreg 123)
   ret void
 }
@@ -1162,7 +1260,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_writelane_b32 v40, s33, 6
+; GCN-NEXT:    v_writelane_b32 v40, s33, 30
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1170,32 +1268,81 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
 ; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    v_writelane_b32 v40, s36, 2
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 3
-; GCN-NEXT:    v_writelane_b32 v40, s30, 4
-; GCN-NEXT:    v_writelane_b32 v40, s31, 5
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s45, 11
+; GCN-NEXT:    v_writelane_b32 v40, s46, 12
+; GCN-NEXT:    v_writelane_b32 v40, s47, 13
+; GCN-NEXT:    v_writelane_b32 v40, s48, 14
+; GCN-NEXT:    v_writelane_b32 v40, s49, 15
+; GCN-NEXT:    v_writelane_b32 v40, s50, 16
+; GCN-NEXT:    v_writelane_b32 v40, s51, 17
+; GCN-NEXT:    v_writelane_b32 v40, s52, 18
+; GCN-NEXT:    v_writelane_b32 v40, s53, 19
+; GCN-NEXT:    v_writelane_b32 v40, s54, 20
+; GCN-NEXT:    v_writelane_b32 v40, s55, 21
+; GCN-NEXT:    v_writelane_b32 v40, s56, 22
+; GCN-NEXT:    v_writelane_b32 v40, s57, 23
+; GCN-NEXT:    v_writelane_b32 v40, s58, 24
+; GCN-NEXT:    v_writelane_b32 v40, s59, 25
+; GCN-NEXT:    v_writelane_b32 v40, s60, 26
+; GCN-NEXT:    v_writelane_b32 v40, s61, 27
+; GCN-NEXT:    v_writelane_b32 v40, s62, 28
+; GCN-NEXT:    v_writelane_b32 v40, s63, 29
+; GCN-NEXT:    s_mov_b64 s[4:5], s[30:31]
 ; GCN-NEXT:    v_mov_b32_e32 v41, v0
-; GCN-NEXT:    s_mov_b64 s[34:35], exec
+; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:  BB7_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s4, v1
-; GCN-NEXT:    v_readfirstlane_b32 s5, v2
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
-; GCN-NEXT:    s_and_saveexec_b64 s[36:37], vcc
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    v_readfirstlane_b32 s11, v2
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[1:2]
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GCN-NEXT:    v_mov_b32_e32 v0, v41
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GCN-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execnz BB7_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v41
-; GCN-NEXT:    v_readlane_b32 s4, v40, 4
-; GCN-NEXT:    v_readlane_b32 s5, v40, 5
+; GCN-NEXT:    v_readlane_b32 s63, v40, 29
+; GCN-NEXT:    v_readlane_b32 s62, v40, 28
+; GCN-NEXT:    v_readlane_b32 s61, v40, 27
+; GCN-NEXT:    v_readlane_b32 s60, v40, 26
+; GCN-NEXT:    v_readlane_b32 s59, v40, 25
+; GCN-NEXT:    v_readlane_b32 s58, v40, 24
+; GCN-NEXT:    v_readlane_b32 s57, v40, 23
+; GCN-NEXT:    v_readlane_b32 s56, v40, 22
+; GCN-NEXT:    v_readlane_b32 s55, v40, 21
+; GCN-NEXT:    v_readlane_b32 s54, v40, 20
+; GCN-NEXT:    v_readlane_b32 s53, v40, 19
+; GCN-NEXT:    v_readlane_b32 s52, v40, 18
+; GCN-NEXT:    v_readlane_b32 s51, v40, 17
+; GCN-NEXT:    v_readlane_b32 s50, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 15
+; GCN-NEXT:    v_readlane_b32 s48, v40, 14
+; GCN-NEXT:    v_readlane_b32 s47, v40, 13
+; GCN-NEXT:    v_readlane_b32 s46, v40, 12
+; GCN-NEXT:    v_readlane_b32 s45, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 3
 ; GCN-NEXT:    v_readlane_b32 s36, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    v_readlane_b32 s33, v40, 6
+; GCN-NEXT:    v_readlane_b32 s33, v40, 30
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
@@ -1208,7 +1355,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_writelane_b32 v40, s33, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s33, 30
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -1216,32 +1363,81 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
 ; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
 ; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s45, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 20
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 21
+; GISEL-NEXT:    v_writelane_b32 v40, s56, 22
+; GISEL-NEXT:    v_writelane_b32 v40, s57, 23
+; GISEL-NEXT:    v_writelane_b32 v40, s58, 24
+; GISEL-NEXT:    v_writelane_b32 v40, s59, 25
+; GISEL-NEXT:    v_writelane_b32 v40, s60, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s61, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s62, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s63, 29
 ; GISEL-NEXT:    v_mov_b32_e32 v41, v0
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 5
-; GISEL-NEXT:    s_mov_b64 s[34:35], exec
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GISEL-NEXT:  BB7_1: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT:    v_readfirstlane_b32 s4, v1
-; GISEL-NEXT:    v_readfirstlane_b32 s5, v2
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
-; GISEL-NEXT:    s_and_saveexec_b64 s[36:37], vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s8, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s9, v2
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v41
-; GISEL-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
 ; GISEL-NEXT:    s_cbranch_execnz BB7_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[34:35]
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v41
-; GISEL-NEXT:    v_readlane_b32 s4, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s5, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s63, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s62, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s61, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s60, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s59, v40, 25
+; GISEL-NEXT:    v_readlane_b32 s58, v40, 24
+; GISEL-NEXT:    v_readlane_b32 s57, v40, 23
+; GISEL-NEXT:    v_readlane_b32 s56, v40, 22
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 21
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 20
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s45, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
 ; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    v_readlane_b32 s33, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s33, v40, 30
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
@@ -1262,38 +1458,87 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_writelane_b32 v40, s33, 6
+; GCN-NEXT:    v_writelane_b32 v40, s33, 30
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    v_writelane_b32 v40, s36, 2
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 3
-; GCN-NEXT:    v_writelane_b32 v40, s30, 4
-; GCN-NEXT:    v_writelane_b32 v40, s31, 5
-; GCN-NEXT:    s_mov_b64 s[34:35], exec
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s45, 11
+; GCN-NEXT:    v_writelane_b32 v40, s46, 12
+; GCN-NEXT:    v_writelane_b32 v40, s47, 13
+; GCN-NEXT:    v_writelane_b32 v40, s48, 14
+; GCN-NEXT:    v_writelane_b32 v40, s49, 15
+; GCN-NEXT:    v_writelane_b32 v40, s50, 16
+; GCN-NEXT:    v_writelane_b32 v40, s51, 17
+; GCN-NEXT:    v_writelane_b32 v40, s52, 18
+; GCN-NEXT:    v_writelane_b32 v40, s53, 19
+; GCN-NEXT:    v_writelane_b32 v40, s54, 20
+; GCN-NEXT:    v_writelane_b32 v40, s55, 21
+; GCN-NEXT:    v_writelane_b32 v40, s56, 22
+; GCN-NEXT:    v_writelane_b32 v40, s57, 23
+; GCN-NEXT:    v_writelane_b32 v40, s58, 24
+; GCN-NEXT:    v_writelane_b32 v40, s59, 25
+; GCN-NEXT:    v_writelane_b32 v40, s60, 26
+; GCN-NEXT:    v_writelane_b32 v40, s61, 27
+; GCN-NEXT:    v_writelane_b32 v40, s62, 28
+; GCN-NEXT:    v_writelane_b32 v40, s63, 29
+; GCN-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:  BB8_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s4, v1
-; GCN-NEXT:    v_readfirstlane_b32 s5, v2
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
-; GCN-NEXT:    s_and_saveexec_b64 s[36:37], vcc
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    v_readfirstlane_b32 s11, v2
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[1:2]
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GCN-NEXT:    v_mov_b32_e32 v3, v0
 ; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execnz BB8_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
-; GCN-NEXT:    v_readlane_b32 s4, v40, 4
-; GCN-NEXT:    v_readlane_b32 s5, v40, 5
+; GCN-NEXT:    v_readlane_b32 s63, v40, 29
+; GCN-NEXT:    v_readlane_b32 s62, v40, 28
+; GCN-NEXT:    v_readlane_b32 s61, v40, 27
+; GCN-NEXT:    v_readlane_b32 s60, v40, 26
+; GCN-NEXT:    v_readlane_b32 s59, v40, 25
+; GCN-NEXT:    v_readlane_b32 s58, v40, 24
+; GCN-NEXT:    v_readlane_b32 s57, v40, 23
+; GCN-NEXT:    v_readlane_b32 s56, v40, 22
+; GCN-NEXT:    v_readlane_b32 s55, v40, 21
+; GCN-NEXT:    v_readlane_b32 s54, v40, 20
+; GCN-NEXT:    v_readlane_b32 s53, v40, 19
+; GCN-NEXT:    v_readlane_b32 s52, v40, 18
+; GCN-NEXT:    v_readlane_b32 s51, v40, 17
+; GCN-NEXT:    v_readlane_b32 s50, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 15
+; GCN-NEXT:    v_readlane_b32 s48, v40, 14
+; GCN-NEXT:    v_readlane_b32 s47, v40, 13
+; GCN-NEXT:    v_readlane_b32 s46, v40, 12
+; GCN-NEXT:    v_readlane_b32 s45, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 3
 ; GCN-NEXT:    v_readlane_b32 s36, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    v_readlane_b32 s33, v40, 6
+; GCN-NEXT:    v_readlane_b32 s33, v40, 30
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
@@ -1306,38 +1551,87 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_writelane_b32 v40, s33, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s33, 30
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
 ; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
 ; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 5
-; GISEL-NEXT:    s_mov_b64 s[34:35], exec
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s45, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 20
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 21
+; GISEL-NEXT:    v_writelane_b32 v40, s56, 22
+; GISEL-NEXT:    v_writelane_b32 v40, s57, 23
+; GISEL-NEXT:    v_writelane_b32 v40, s58, 24
+; GISEL-NEXT:    v_writelane_b32 v40, s59, 25
+; GISEL-NEXT:    v_writelane_b32 v40, s60, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s61, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s62, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s63, 29
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GISEL-NEXT:  BB8_1: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT:    v_readfirstlane_b32 s4, v1
-; GISEL-NEXT:    v_readfirstlane_b32 s5, v2
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
-; GISEL-NEXT:    s_and_saveexec_b64 s[36:37], vcc
-; GISEL-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT:    v_readfirstlane_b32 s8, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s9, v2
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    v_mov_b32_e32 v3, v0
 ; GISEL-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
 ; GISEL-NEXT:    s_cbranch_execnz BB8_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[34:35]
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_readlane_b32 s4, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s5, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s63, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s62, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s61, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s60, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s59, v40, 25
+; GISEL-NEXT:    v_readlane_b32 s58, v40, 24
+; GISEL-NEXT:    v_readlane_b32 s57, v40, 23
+; GISEL-NEXT:    v_readlane_b32 s56, v40, 22
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 21
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 20
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s45, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
 ; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    v_readlane_b32 s33, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s33, v40, 30
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
@@ -1355,35 +1649,84 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_writelane_b32 v40, s33, 6
+; GCN-NEXT:    v_writelane_b32 v40, s33, 30
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    v_writelane_b32 v40, s36, 2
 ; GCN-NEXT:    v_writelane_b32 v40, s37, 3
-; GCN-NEXT:    v_writelane_b32 v40, s30, 4
-; GCN-NEXT:    v_writelane_b32 v40, s31, 5
-; GCN-NEXT:    s_mov_b64 s[34:35], exec
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
+; GCN-NEXT:    v_writelane_b32 v40, s45, 11
+; GCN-NEXT:    v_writelane_b32 v40, s46, 12
+; GCN-NEXT:    v_writelane_b32 v40, s47, 13
+; GCN-NEXT:    v_writelane_b32 v40, s48, 14
+; GCN-NEXT:    v_writelane_b32 v40, s49, 15
+; GCN-NEXT:    v_writelane_b32 v40, s50, 16
+; GCN-NEXT:    v_writelane_b32 v40, s51, 17
+; GCN-NEXT:    v_writelane_b32 v40, s52, 18
+; GCN-NEXT:    v_writelane_b32 v40, s53, 19
+; GCN-NEXT:    v_writelane_b32 v40, s54, 20
+; GCN-NEXT:    v_writelane_b32 v40, s55, 21
+; GCN-NEXT:    v_writelane_b32 v40, s56, 22
+; GCN-NEXT:    v_writelane_b32 v40, s57, 23
+; GCN-NEXT:    v_writelane_b32 v40, s58, 24
+; GCN-NEXT:    v_writelane_b32 v40, s59, 25
+; GCN-NEXT:    v_writelane_b32 v40, s60, 26
+; GCN-NEXT:    v_writelane_b32 v40, s61, 27
+; GCN-NEXT:    v_writelane_b32 v40, s62, 28
+; GCN-NEXT:    v_writelane_b32 v40, s63, 29
+; GCN-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GCN-NEXT:    s_mov_b64 s[6:7], exec
 ; GCN-NEXT:  BB9_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s4, v0
-; GCN-NEXT:    v_readfirstlane_b32 s5, v1
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN-NEXT:    s_and_saveexec_b64 s[36:37], vcc
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_readfirstlane_b32 s10, v0
+; GCN-NEXT:    v_readfirstlane_b32 s11, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execnz BB9_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    v_readlane_b32 s4, v40, 4
-; GCN-NEXT:    v_readlane_b32 s5, v40, 5
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    v_readlane_b32 s63, v40, 29
+; GCN-NEXT:    v_readlane_b32 s62, v40, 28
+; GCN-NEXT:    v_readlane_b32 s61, v40, 27
+; GCN-NEXT:    v_readlane_b32 s60, v40, 26
+; GCN-NEXT:    v_readlane_b32 s59, v40, 25
+; GCN-NEXT:    v_readlane_b32 s58, v40, 24
+; GCN-NEXT:    v_readlane_b32 s57, v40, 23
+; GCN-NEXT:    v_readlane_b32 s56, v40, 22
+; GCN-NEXT:    v_readlane_b32 s55, v40, 21
+; GCN-NEXT:    v_readlane_b32 s54, v40, 20
+; GCN-NEXT:    v_readlane_b32 s53, v40, 19
+; GCN-NEXT:    v_readlane_b32 s52, v40, 18
+; GCN-NEXT:    v_readlane_b32 s51, v40, 17
+; GCN-NEXT:    v_readlane_b32 s50, v40, 16
+; GCN-NEXT:    v_readlane_b32 s49, v40, 15
+; GCN-NEXT:    v_readlane_b32 s48, v40, 14
+; GCN-NEXT:    v_readlane_b32 s47, v40, 13
+; GCN-NEXT:    v_readlane_b32 s46, v40, 12
+; GCN-NEXT:    v_readlane_b32 s45, v40, 11
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
 ; GCN-NEXT:    v_readlane_b32 s37, v40, 3
 ; GCN-NEXT:    v_readlane_b32 s36, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    v_readlane_b32 s33, v40, 6
+; GCN-NEXT:    v_readlane_b32 s33, v40, 30
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
@@ -1396,35 +1739,84 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_writelane_b32 v40, s33, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s33, 30
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s34, 0
 ; GISEL-NEXT:    v_writelane_b32 v40, s35, 1
 ; GISEL-NEXT:    v_writelane_b32 v40, s36, 2
 ; GISEL-NEXT:    v_writelane_b32 v40, s37, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 5
-; GISEL-NEXT:    s_mov_b64 s[34:35], exec
+; GISEL-NEXT:    v_writelane_b32 v40, s38, 4
+; GISEL-NEXT:    v_writelane_b32 v40, s39, 5
+; GISEL-NEXT:    v_writelane_b32 v40, s40, 6
+; GISEL-NEXT:    v_writelane_b32 v40, s41, 7
+; GISEL-NEXT:    v_writelane_b32 v40, s42, 8
+; GISEL-NEXT:    v_writelane_b32 v40, s43, 9
+; GISEL-NEXT:    v_writelane_b32 v40, s44, 10
+; GISEL-NEXT:    v_writelane_b32 v40, s45, 11
+; GISEL-NEXT:    v_writelane_b32 v40, s46, 12
+; GISEL-NEXT:    v_writelane_b32 v40, s47, 13
+; GISEL-NEXT:    v_writelane_b32 v40, s48, 14
+; GISEL-NEXT:    v_writelane_b32 v40, s49, 15
+; GISEL-NEXT:    v_writelane_b32 v40, s50, 16
+; GISEL-NEXT:    v_writelane_b32 v40, s51, 17
+; GISEL-NEXT:    v_writelane_b32 v40, s52, 18
+; GISEL-NEXT:    v_writelane_b32 v40, s53, 19
+; GISEL-NEXT:    v_writelane_b32 v40, s54, 20
+; GISEL-NEXT:    v_writelane_b32 v40, s55, 21
+; GISEL-NEXT:    v_writelane_b32 v40, s56, 22
+; GISEL-NEXT:    v_writelane_b32 v40, s57, 23
+; GISEL-NEXT:    v_writelane_b32 v40, s58, 24
+; GISEL-NEXT:    v_writelane_b32 v40, s59, 25
+; GISEL-NEXT:    v_writelane_b32 v40, s60, 26
+; GISEL-NEXT:    v_writelane_b32 v40, s61, 27
+; GISEL-NEXT:    v_writelane_b32 v40, s62, 28
+; GISEL-NEXT:    v_writelane_b32 v40, s63, 29
+; GISEL-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GISEL-NEXT:  BB9_1: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    s_and_saveexec_b64 s[36:37], vcc
-; GISEL-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
 ; GISEL-NEXT:    s_cbranch_execnz BB9_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[34:35]
-; GISEL-NEXT:    v_readlane_b32 s4, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s5, v40, 5
+; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; GISEL-NEXT:    v_readlane_b32 s63, v40, 29
+; GISEL-NEXT:    v_readlane_b32 s62, v40, 28
+; GISEL-NEXT:    v_readlane_b32 s61, v40, 27
+; GISEL-NEXT:    v_readlane_b32 s60, v40, 26
+; GISEL-NEXT:    v_readlane_b32 s59, v40, 25
+; GISEL-NEXT:    v_readlane_b32 s58, v40, 24
+; GISEL-NEXT:    v_readlane_b32 s57, v40, 23
+; GISEL-NEXT:    v_readlane_b32 s56, v40, 22
+; GISEL-NEXT:    v_readlane_b32 s55, v40, 21
+; GISEL-NEXT:    v_readlane_b32 s54, v40, 20
+; GISEL-NEXT:    v_readlane_b32 s53, v40, 19
+; GISEL-NEXT:    v_readlane_b32 s52, v40, 18
+; GISEL-NEXT:    v_readlane_b32 s51, v40, 17
+; GISEL-NEXT:    v_readlane_b32 s50, v40, 16
+; GISEL-NEXT:    v_readlane_b32 s49, v40, 15
+; GISEL-NEXT:    v_readlane_b32 s48, v40, 14
+; GISEL-NEXT:    v_readlane_b32 s47, v40, 13
+; GISEL-NEXT:    v_readlane_b32 s46, v40, 12
+; GISEL-NEXT:    v_readlane_b32 s45, v40, 11
+; GISEL-NEXT:    v_readlane_b32 s44, v40, 10
+; GISEL-NEXT:    v_readlane_b32 s43, v40, 9
+; GISEL-NEXT:    v_readlane_b32 s42, v40, 8
+; GISEL-NEXT:    v_readlane_b32 s41, v40, 7
+; GISEL-NEXT:    v_readlane_b32 s40, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s39, v40, 5
+; GISEL-NEXT:    v_readlane_b32 s38, v40, 4
 ; GISEL-NEXT:    v_readlane_b32 s37, v40, 3
 ; GISEL-NEXT:    v_readlane_b32 s36, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s35, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s34, v40, 0
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    v_readlane_b32 s33, v40, 6
+; GISEL-NEXT:    v_readlane_b32 s33, v40, 30
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index ebb05a008681..3a5654f63a42 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:SGPR_128 */, def %4
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5439498 /* regdef:SGPR_128 */, def %4
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5439497 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:SGPR_128 */, def %4
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5439498 /* regdef:SGPR_128 */, def %4
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5439497 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
   call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -26,15 +26,15 @@ define amdgpu_kernel void @s_input_output_i128() {
 define amdgpu_kernel void @v_input_output_i128() {
   ; GFX908-LABEL: name: v_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4718602 /* regdef:VReg_128 */, def %4
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5046282 /* regdef:VReg_128 */, def %4
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %4
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4718601 /* reguse:VReg_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ; GFX90A-LABEL: name: v_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4849674 /* regdef:VReg_128_Align2 */, def %4
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5177354 /* regdef:VReg_128_Align2 */, def %4
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4849673 /* reguse:VReg_128_Align2 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5177353 /* reguse:VReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
   call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -44,15 +44,15 @@ define amdgpu_kernel void @v_input_output_i128() {
 define amdgpu_kernel void @a_input_output_i128() {
   ; GFX908-LABEL: name: a_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4653066 /* regdef:AReg_128 */, def %4
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4980746 /* regdef:AReg_128 */, def %4
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %4
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4653065 /* reguse:AReg_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4980745 /* reguse:AReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ; GFX90A-LABEL: name: a_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:AReg_128_Align2 */, def %4
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:AReg_128_Align2 */, def %4
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:AReg_128_Align2 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:AReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
   call void asm sideeffect "; use $0", "a"(i128 %val)

diff  --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
index f890ab2625f1..d9d511f72ae8 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
@@ -28,31 +28,32 @@ define amdgpu_cs void @test_simple_indirect_call() {
 ; Attributor adds work-group-size attribute. This should be ok.
 ; GFX9-LABEL: test_simple_indirect_call:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_getpc_b64 s[36:37]
-; GFX9-NEXT:    s_mov_b32 s36, s0
-; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[36:37], 0x10
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s36, s36, s0
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_add_u32 s8, s8, s0
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[10:11]
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: test_simple_indirect_call:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_getpc_b64 s[36:37]
-; GFX10-NEXT:    s_mov_b32 s36, s0
+; GFX10-NEXT:    s_getpc_b64 s[8:9]
+; GFX10-NEXT:    s_mov_b32 s8, s0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_load_dwordx4 s[36:39], s[36:37], 0x10
+; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
 ; GFX10-NEXT:    s_mov_b32 s32, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bitset0_b32 s39, 21
-; GFX10-NEXT:    s_add_u32 s36, s36, s0
-; GFX10-NEXT:    s_addc_u32 s37, s37, 0
-; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX10-NEXT:    s_bitset0_b32 s11, 21
+; GFX10-NEXT:    s_add_u32 s8, s8, s0
+; GFX10-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[10:11]
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 

diff  --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
index e6de73faefbe..4894eeca5825 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
@@ -17,12 +17,28 @@ define amdgpu_gfx float @caller(float %arg0) {
 ; GCN-LABEL: caller:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    v_writelane_b32 v1, s33, 1
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v1, s4, 0
 ; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    s_mov_b32 s4, 2.0
-; GCN-NEXT:    s_getpc_b64 s[6:7]
-; GCN-NEXT:    s_add_u32 s6, s6, callee at rel32@lo+4
-; GCN-NEXT:    s_addc_u32 s7, s7, callee at rel32@hi+12
-; GCN-NEXT:    s_setpc_b64 s[6:7]
+; GCN-NEXT:    s_mov_b64 s[36:37], s[30:31]
+; GCN-NEXT:    s_getpc_b64 s[30:31]
+; GCN-NEXT:    s_add_u32 s30, s30, callee at rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s31, s31, callee at rel32@hi+12
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GCN-NEXT:    v_readlane_b32 s4, v1, 0
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    v_readlane_b32 s33, v1, 1
+; GCN-NEXT:    s_or_saveexec_b64 s[30:31], -1
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[30:31]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[36:37]
   %add = fadd float %arg0, 1.0
   %call = tail call amdgpu_gfx float @callee(float %add, float inreg 2.0)
   ret float %call

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 9b6929d29c0c..4061eda1b74f 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -261,7 +261,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
   ; SI-NEXT:   [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]]
   ; SI-NEXT:   $vgpr0 = COPY killed [[PHI5]]
-  ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
+  ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; SI-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; SI-NEXT:   $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc
@@ -294,7 +294,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
   ; SI-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]]
   ; SI-NEXT:   $vgpr0 = COPY killed [[PHI7]]
-  ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
+  ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; SI-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; SI-NEXT:   $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc
@@ -374,7 +374,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(
   ; SI-NEXT:   [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]]
   ; SI-NEXT:   $vgpr0 = COPY [[COPY4]]
-  ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
+  ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; SI-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; SI-NEXT:   $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc
@@ -406,7 +406,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(
   ; SI-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]]
   ; SI-NEXT:   $vgpr0 = COPY [[COPY4]]
-  ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
+  ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; SI-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; SI-NEXT:   $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index 3087fdf9d547..57841c00355d 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -158,61 +158,61 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
 ; SI-LABEL: loop:
 ; SI:       ; %bb.0: ; %main_body
 ; SI-NEXT:    v_mov_b32_e32 v6, v0
-; SI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_mov_b32 s38, -1
+; SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_mov_b32 s14, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, v1
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 6, v6
-; SI-NEXT:    s_mov_b32 s39, 0x31c16000
-; SI-NEXT:    s_add_u32 s36, s36, s1
-; SI-NEXT:    s_addc_u32 s37, s37, 0
+; SI-NEXT:    s_mov_b32 s15, 0x31c16000
+; SI-NEXT:    s_add_u32 s12, s12, s1
+; SI-NEXT:    s_addc_u32 s13, s13, 0
 ; SI-NEXT:    s_mov_b32 s32, 0
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; SI-NEXT:    s_xor_b32 s33, exec_lo, s0
+; SI-NEXT:    s_xor_b32 s4, exec_lo, s0
 ; SI-NEXT:    s_cbranch_execz BB3_4
 ; SI-NEXT:  ; %bb.1: ; %else
-; SI-NEXT:    s_mov_b32 s34, exec_lo
+; SI-NEXT:    s_mov_b32 s5, exec_lo
 ; SI-NEXT:  BB3_2: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_readfirstlane_b32 s4, v4
-; SI-NEXT:    v_readfirstlane_b32 s5, v5
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; SI-NEXT:    s_and_saveexec_b32 s35, vcc_lo
-; SI-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; SI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; SI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; SI-NEXT:    v_readfirstlane_b32 s6, v4
+; SI-NEXT:    v_readfirstlane_b32 s7, v5
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[4:5]
+; SI-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; SI-NEXT:    s_mov_b64 s[0:1], s[12:13]
+; SI-NEXT:    s_mov_b64 s[2:3], s[14:15]
+; SI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s35
+; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s8
 ; SI-NEXT:    s_cbranch_execnz BB3_2
 ; SI-NEXT:  ; %bb.3:
-; SI-NEXT:    s_mov_b32 exec_lo, s34
+; SI-NEXT:    s_mov_b32 exec_lo, s5
 ; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:  BB3_4: ; %Flow
-; SI-NEXT:    s_or_saveexec_b32 s33, s33
-; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s33
+; SI-NEXT:    s_or_saveexec_b32 s4, s4
+; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; SI-NEXT:    s_cbranch_execz BB3_8
 ; SI-NEXT:  ; %bb.5: ; %if
-; SI-NEXT:    s_mov_b32 s34, exec_lo
+; SI-NEXT:    s_mov_b32 s5, exec_lo
 ; SI-NEXT:  BB3_6: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_readfirstlane_b32 s4, v2
-; SI-NEXT:    v_readfirstlane_b32 s5, v3
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
-; SI-NEXT:    s_and_saveexec_b32 s35, vcc_lo
-; SI-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; SI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; SI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; SI-NEXT:    v_readfirstlane_b32 s6, v2
+; SI-NEXT:    v_readfirstlane_b32 s7, v3
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[2:3]
+; SI-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; SI-NEXT:    s_mov_b64 s[0:1], s[12:13]
+; SI-NEXT:    s_mov_b64 s[2:3], s[14:15]
+; SI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s35
+; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s8
 ; SI-NEXT:    s_cbranch_execnz BB3_6
 ; SI-NEXT:  ; %bb.7:
-; SI-NEXT:    s_mov_b32 exec_lo, s34
+; SI-NEXT:    s_mov_b32 exec_lo, s5
 ; SI-NEXT:  BB3_8: ; %end
-; SI-NEXT:    s_or_b32 exec_lo, exec_lo, s33
+; SI-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; SI-NEXT:    v_mov_b32_e32 v0, v1
 ; SI-NEXT:    ; return to shader part epilog
 main_body:
@@ -236,58 +236,58 @@ end:
 define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 {
 ; SI-LABEL: loop_with_use:
 ; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_mov_b32 s38, -1
+; SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_mov_b32 s14, -1
 ; SI-NEXT:    v_mov_b32_e32 v40, v1
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 6, v0
-; SI-NEXT:    s_mov_b32 s39, 0x31c16000
-; SI-NEXT:    s_add_u32 s36, s36, s1
-; SI-NEXT:    s_addc_u32 s37, s37, 0
+; SI-NEXT:    s_mov_b32 s15, 0x31c16000
+; SI-NEXT:    s_add_u32 s12, s12, s1
+; SI-NEXT:    s_addc_u32 s13, s13, 0
 ; SI-NEXT:    s_mov_b32 s32, 0
 ; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; SI-NEXT:    s_xor_b32 s33, exec_lo, s0
+; SI-NEXT:    s_xor_b32 s4, exec_lo, s0
 ; SI-NEXT:    s_cbranch_execz BB4_4
 ; SI-NEXT:  ; %bb.1: ; %else
-; SI-NEXT:    s_mov_b32 s34, exec_lo
+; SI-NEXT:    s_mov_b32 s5, exec_lo
 ; SI-NEXT:  BB4_2: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_readfirstlane_b32 s4, v4
-; SI-NEXT:    v_readfirstlane_b32 s5, v5
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; SI-NEXT:    s_and_saveexec_b32 s35, vcc_lo
+; SI-NEXT:    v_readfirstlane_b32 s6, v4
+; SI-NEXT:    v_readfirstlane_b32 s7, v5
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[4:5]
+; SI-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; SI-NEXT:    v_mov_b32_e32 v0, v40
-; SI-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; SI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; SI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; SI-NEXT:    s_mov_b64 s[0:1], s[12:13]
+; SI-NEXT:    s_mov_b64 s[2:3], s[14:15]
+; SI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; SI-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s35
+; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s8
 ; SI-NEXT:    s_cbranch_execnz BB4_2
 ; SI-NEXT:  ; %bb.3:
-; SI-NEXT:    s_mov_b32 exec_lo, s34
+; SI-NEXT:    s_mov_b32 exec_lo, s5
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:  BB4_4: ; %Flow
-; SI-NEXT:    s_or_saveexec_b32 s33, s33
-; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s33
+; SI-NEXT:    s_or_saveexec_b32 s4, s4
+; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; SI-NEXT:    s_cbranch_execz BB4_8
 ; SI-NEXT:  ; %bb.5: ; %if
-; SI-NEXT:    s_mov_b32 s34, exec_lo
+; SI-NEXT:    s_mov_b32 s5, exec_lo
 ; SI-NEXT:  BB4_6: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_readfirstlane_b32 s4, v2
-; SI-NEXT:    v_readfirstlane_b32 s5, v3
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
-; SI-NEXT:    s_and_saveexec_b32 s35, vcc_lo
+; SI-NEXT:    v_readfirstlane_b32 s6, v2
+; SI-NEXT:    v_readfirstlane_b32 s7, v3
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[2:3]
+; SI-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; SI-NEXT:    v_mov_b32_e32 v0, v40
-; SI-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; SI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; SI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; SI-NEXT:    s_mov_b64 s[0:1], s[12:13]
+; SI-NEXT:    s_mov_b64 s[2:3], s[14:15]
+; SI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s35
+; SI-NEXT:    s_xor_b32 exec_lo, exec_lo, s8
 ; SI-NEXT:    s_cbranch_execnz BB4_6
 ; SI-NEXT:  ; %bb.7:
-; SI-NEXT:    s_mov_b32 exec_lo, s34
+; SI-NEXT:    s_mov_b32 exec_lo, s5
 ; SI-NEXT:  BB4_8: ; %end
-; SI-NEXT:    s_or_b32 exec_lo, exec_lo, s33
+; SI-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; SI-NEXT:    v_add_f32_e32 v0, v0, v40
 ; SI-NEXT:    ; return to shader part epilog
 main_body:

diff  --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 3d655d23e73c..f796910ee601 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -6,77 +6,75 @@ define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) {
 ; GFX9-O0-LABEL: strict_wwm_no_cfg:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX9-O0-NEXT:    s_mov_b32 s8, s7
-; GFX9-O0-NEXT:    s_mov_b32 s9, s6
-; GFX9-O0-NEXT:    s_mov_b32 s10, s5
-; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
-; GFX9-O0-NEXT:    s_mov_b32 s5, s10
-; GFX9-O0-NEXT:    s_mov_b32 s6, s9
-; GFX9-O0-NEXT:    s_mov_b32 s7, s8
-; GFX9-O0-NEXT:    ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7
-; GFX9-O0-NEXT:    s_mov_b32 s8, 0
-; GFX9-O0-NEXT:    buffer_load_dwordx2 v[5:6], off, s[4:7], s8
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT:    s_mov_b32 s36, s4
+; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
+; GFX9-O0-NEXT:    s_mov_b32 s37, s5
+; GFX9-O0-NEXT:    s_mov_b32 s38, s6
+; GFX9-O0-NEXT:    s_mov_b32 s39, s7
+; GFX9-O0-NEXT:    ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39
+; GFX9-O0-NEXT:    s_mov_b32 s34, 0
+; GFX9-O0-NEXT:    buffer_load_dwordx2 v[5:6], off, s[36:39], s34
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s34
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s34
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O0-NEXT:    v_add_u32_e64 v0, v0, v2
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s34
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v0, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O0-NEXT:    v_add_u32_e64 v0, v1, v0
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[10:11], v3, v4
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[10:11]
-; GFX9-O0-NEXT:    s_mov_b32 s9, 1
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s9, v3
-; GFX9-O0-NEXT:    s_mov_b32 s9, 2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[40:41], v3, v4
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[40:41]
+; GFX9-O0-NEXT:    s_mov_b32 s35, 1
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s35, v3
+; GFX9-O0-NEXT:    s_mov_b32 s35, 2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s35
 ; GFX9-O0-NEXT:    v_and_b32_e32 v3, v3, v4
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[4:7], s8 offset:4
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[36:39], s34 offset:4
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_no_cfg:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
@@ -84,12 +82,12 @@ define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) {
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-O3-NEXT:    v_add_u32_e32 v0, v3, v0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
@@ -97,14 +95,14 @@ define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) {
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX9-O3-NEXT:    v_and_b32_e32 v4, 2, v4
 ; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
@@ -136,52 +134,51 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
 ; GFX9-O0-LABEL: strict_wwm_cfg:
 ; GFX9-O0:       ; %bb.0: ; %entry
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    v_writelane_b32 v5, s30, 0
 ; GFX9-O0-NEXT:    v_writelane_b32 v5, s31, 1
-; GFX9-O0-NEXT:    s_mov_b32 s8, s4
-; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
-; GFX9-O0-NEXT:    s_mov_b32 s9, s5
-; GFX9-O0-NEXT:    s_mov_b32 s10, s6
-; GFX9-O0-NEXT:    s_mov_b32 s11, s7
-; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[8:9]
-; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[10:11]
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s5, 3
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s6, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s7, 5
-; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], off, s[8:11], s4
+; GFX9-O0-NEXT:    s_mov_b32 s36, s4
+; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
+; GFX9-O0-NEXT:    s_mov_b32 s37, s5
+; GFX9-O0-NEXT:    s_mov_b32 s38, s6
+; GFX9-O0-NEXT:    s_mov_b32 s39, s7
+; GFX9-O0-NEXT:    s_mov_b64 s[42:43], s[38:39]
+; GFX9-O0-NEXT:    s_mov_b64 s[40:41], s[36:37]
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s40, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s41, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s42, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s43, 5
+; GFX9-O0-NEXT:    s_mov_b32 s30, 0
+; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], off, s[36:39], s30
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s30
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s30
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[34:35], v0, s30
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s30
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s4, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v5, s5, 7
-; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O0-NEXT:    s_mov_b64 s[30:31], exec
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s30, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v5, s31, 7
+; GFX9-O0-NEXT:    s_and_b64 s[30:31], s[30:31], s[34:35]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O0-NEXT:    s_cbranch_execz BB1_2
 ; GFX9-O0-NEXT:  ; %bb.1: ; %if
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
@@ -189,100 +186,100 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[30:31], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[30:31], -1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v2, v1
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:  BB1_2: ; %merge
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v5, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v5, 7
-; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v5, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v5, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s8, v5, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s9, v5, 3
-; GFX9-O0-NEXT:    v_readlane_b32 s10, v5, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s11, v5, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s34, v5, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s35, v5, 7
+; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[34:35]
+; GFX9-O0-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s36, v5, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s37, v5, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s38, v5, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s39, v5, 5
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, v3
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
-; GFX9-O0-NEXT:    s_mov_b32 s6, 1
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
-; GFX9-O0-NEXT:    s_mov_b32 s6, 2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[34:35], v0, v3
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX9-O0-NEXT:    s_mov_b32 s34, 1
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s34, v0
+; GFX9-O0-NEXT:    s_mov_b32 s34, 2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s34
 ; GFX9-O0-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX9-O0-NEXT:    s_mov_b32 s6, 0
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[8:11], s6 offset:4
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    s_mov_b32 s34, 0
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    s_setpc_b64 s[4:5]
+; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_cfg:
 ; GFX9-O3:       ; %bb.0: ; %entry
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-O3-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX9-O3-NEXT:    s_and_saveexec_b64 s[34:35], vcc
 ; GFX9-O3-NEXT:  ; %bb.1: ; %if
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v3, v1
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9-O3-NEXT:  ; %bb.2: ; %merge
-; GFX9-O3-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX9-O3-NEXT:    s_or_b64 exec, exec, s[34:35]
 ; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-O3-NEXT:    v_and_b32_e32 v0, 2, v0
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -343,92 +340,89 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
 ; GFX9-O0-LABEL: strict_wwm_call:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    v_writelane_b32 v3, s33, 2
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x400
 ; GFX9-O0-NEXT:    v_writelane_b32 v3, s30, 0
 ; GFX9-O0-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX9-O0-NEXT:    s_mov_b32 s9, s8
-; GFX9-O0-NEXT:    s_mov_b32 s8, s7
-; GFX9-O0-NEXT:    s_mov_b32 s10, s6
-; GFX9-O0-NEXT:    s_mov_b32 s11, s5
-; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
-; GFX9-O0-NEXT:    s_mov_b32 s5, s11
-; GFX9-O0-NEXT:    s_mov_b32 s6, s10
-; GFX9-O0-NEXT:    s_mov_b32 s7, s8
-; GFX9-O0-NEXT:    ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15 killed $sgpr4_sgpr5_sgpr6_sgpr7
-; GFX9-O0-NEXT:    s_mov_b32 s8, 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-O0-NEXT:    s_mov_b32 s36, s4
+; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
+; GFX9-O0-NEXT:    s_mov_b32 s37, s5
+; GFX9-O0-NEXT:    s_mov_b32 s38, s6
+; GFX9-O0-NEXT:    s_mov_b32 s39, s7
+; GFX9-O0-NEXT:    ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39
+; GFX9-O0-NEXT:    s_mov_b32 s34, 0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    s_getpc_b64 s[12:13]
-; GFX9-O0-NEXT:    s_add_u32 s12, s12, strict_wwm_called at rel32@lo+4
-; GFX9-O0-NEXT:    s_addc_u32 s13, s13, strict_wwm_called at rel32@hi+12
-; GFX9-O0-NEXT:    s_mov_b64 s[18:19], s[2:3]
-; GFX9-O0-NEXT:    s_mov_b64 s[16:17], s[0:1]
-; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[16:17]
-; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[18:19]
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-O0-NEXT:    s_add_u32 s30, s30, strict_wwm_called at rel32@lo+4
+; GFX9-O0-NEXT:    s_addc_u32 s31, s31, strict_wwm_called at rel32@hi+12
+; GFX9-O0-NEXT:    s_mov_b64 s[46:47], s[2:3]
+; GFX9-O0-NEXT:    s_mov_b64 s[44:45], s[0:1]
+; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[44:45]
+; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[46:47]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[4:7], s8 offset:4
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffffc00
 ; GFX9-O0-NEXT:    v_readlane_b32 s33, v3, 2
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_call:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX9-O3-NEXT:    s_mov_b32 s14, s33
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    s_mov_b32 s38, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-O3-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX9-O3-NEXT:    s_mov_b64 s[36:37], s[30:31]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O3-NEXT:    s_getpc_b64 s[12:13]
-; GFX9-O3-NEXT:    s_add_u32 s12, s12, strict_wwm_called at rel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s13, s13, strict_wwm_called at rel32@hi+12
-; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX9-O3-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-O3-NEXT:    s_add_u32 s30, s30, strict_wwm_called at rel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s31, s31, strict_wwm_called at rel32@hi+12
+; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-O3-NEXT:    s_mov_b32 s33, s14
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    s_mov_b32 s33, s38
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[30:31], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    s_setpc_b64 s[10:11]
+; GFX9-O3-NEXT:    s_setpc_b64 s[36:37]
   %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
   %tmp134 = call amdgpu_gfx i32 @strict_wwm_called(i32 %tmp107)
   %tmp136 = add i32 %tmp134, %tmp107
@@ -449,32 +443,32 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v4, s[4:5], v2, v3
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v0, s[4:5], v0, v1, s[4:5]
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v4, s[34:35], v2, v3
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v0, s[34:35], v0, v1, s[34:35]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9-O0-NEXT:    s_mov_b32 s4, 32
+; GFX9-O0-NEXT:    s_mov_b32 s34, 32
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s34, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mul_hi_u32 v1, v0, v6
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[7:8], s4, v[4:5]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[7:8], s34, v[4:5]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9-O0-NEXT:    v_mul_lo_u32 v3, v3, v6
 ; GFX9-O0-NEXT:    v_add3_u32 v1, v1, v2, v3
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr36
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s35
 ; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[1:2], s4, v[1:2]
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[1:2], s34, v[1:2]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-O0-NEXT:    v_mul_lo_u32 v6, v0, v6
-; GFX9-O0-NEXT:    s_mov_b32 s5, 0
+; GFX9-O0-NEXT:    s_mov_b32 s35, 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v0
@@ -489,12 +483,12 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
-; GFX9-O0-NEXT:    v_sub_co_u32_e64 v1, s[6:7], v1, v3
-; GFX9-O0-NEXT:    v_subb_co_u32_e64 v0, s[6:7], v0, v2, s[6:7]
+; GFX9-O0-NEXT:    v_sub_co_u32_e64 v1, s[36:37], v1, v3
+; GFX9-O0-NEXT:    v_subb_co_u32_e64 v0, s[36:37], v0, v2, s[36:37]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[1:2], s4, v[1:2]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[1:2], s34, v[1:2]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -521,7 +515,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -536,78 +530,75 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s33, 9
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s33, 8
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xc00
 ; GFX9-O0-NEXT:    v_writelane_b32 v11, s30, 0
 ; GFX9-O0-NEXT:    v_writelane_b32 v11, s31, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s9, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s8, 3
-; GFX9-O0-NEXT:    s_mov_b32 s8, s6
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v11, 3
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s8, 4
-; GFX9-O0-NEXT:    s_mov_b32 s12, s5
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v11, 4
-; GFX9-O0-NEXT:    s_mov_b32 s8, s4
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v11, 2
-; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
-; GFX9-O0-NEXT:    s_mov_b32 s9, s12
-; GFX9-O0-NEXT:    s_mov_b32 s10, s5
-; GFX9-O0-NEXT:    s_mov_b32 s11, s7
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s8, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s9, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s10, 7
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s11, 8
-; GFX9-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
-; GFX9-O0-NEXT:    s_mov_b32 s7, s4
-; GFX9-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr6_sgpr7
-; GFX9-O0-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-O0-NEXT:    s_mov_b32 s34, s8
+; GFX9-O0-NEXT:    s_mov_b32 s36, s4
+; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
+; GFX9-O0-NEXT:    s_mov_b32 s37, s5
+; GFX9-O0-NEXT:    s_mov_b32 s38, s6
+; GFX9-O0-NEXT:    s_mov_b32 s39, s7
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s36, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s37, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s38, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s39, 5
+; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
+; GFX9-O0-NEXT:    s_mov_b32 s35, s9
+; GFX9-O0-NEXT:    ; kill: def $sgpr30_sgpr31 killed $sgpr34_sgpr35
+; GFX9-O0-NEXT:    s_mov_b64 s[30:31], 0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s35
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s30
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s31
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[30:31], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s30, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s31, 7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v9
-; GFX9-O0-NEXT:    s_mov_b32 s4, 32
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s4, v[9:10]
-; GFX9-O0-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-O0-NEXT:    s_add_u32 s4, s4, strict_wwm_called_i64 at gotpcrel32@lo+4
-; GFX9-O0-NEXT:    s_addc_u32 s5, s5, strict_wwm_called_i64 at gotpcrel32@hi+12
-; GFX9-O0-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[2:3]
-; GFX9-O0-NEXT:    s_mov_b64 s[12:13], s[0:1]
-; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[12:13]
-; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[14:15]
+; GFX9-O0-NEXT:    s_mov_b32 s30, 32
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s30, v[9:10]
+; GFX9-O0-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-O0-NEXT:    s_add_u32 s30, s30, strict_wwm_called_i64 at gotpcrel32@lo+4
+; GFX9-O0-NEXT:    s_addc_u32 s31, s31, strict_wwm_called_i64 at gotpcrel32@hi+12
+; GFX9-O0-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
+; GFX9-O0-NEXT:    s_mov_b64 s[38:39], s[2:3]
+; GFX9-O0-NEXT:    s_mov_b64 s[36:37], s[0:1]
+; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v11, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v11, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v11, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v11, 8
+; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-O0-NEXT:    v_readlane_b32 s34, v11, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s35, v11, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s36, v11, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s37, v11, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s38, v11, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s39, v11, 5
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v11, 0
 ; GFX9-O0-NEXT:    v_readlane_b32 s31, v11, 1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v2, s[10:11], v2, v4
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v3, s[10:11], v3, v5, s[10:11]
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v2, s[40:41], v2, v4
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    s_mov_b32 s8, 0
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], s8 offset:4
+; GFX9-O0-NEXT:    s_mov_b32 s34, 0
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffff400
-; GFX9-O0-NEXT:    v_readlane_b32 s33, v11, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O0-NEXT:    v_readlane_b32 s33, v11, 8
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -628,14 +619,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_call_i64:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -644,37 +635,37 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX9-O3-NEXT:    s_mov_b32 s14, s33
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    s_mov_b32 s38, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-O3-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX9-O3-NEXT:    s_mov_b64 s[36:37], s[30:31]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GFX9-O3-NEXT:    s_getpc_b64 s[12:13]
-; GFX9-O3-NEXT:    s_add_u32 s12, s12, strict_wwm_called_i64 at gotpcrel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s13, s13, strict_wwm_called_i64 at gotpcrel32@hi+12
-; GFX9-O3-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-O3-NEXT:    s_add_u32 s30, s30, strict_wwm_called_i64 at gotpcrel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s31, s31, strict_wwm_called_i64 at gotpcrel32@hi+12
+; GFX9-O3-NEXT:    s_load_dwordx2 s[30:31], s[30:31], 0x0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
 ; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xf800
-; GFX9-O3-NEXT:    s_mov_b32 s33, s14
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    s_mov_b32 s33, s38
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[30:31], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -685,9 +676,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[30:31]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    s_setpc_b64 s[10:11]
+; GFX9-O3-NEXT:    s_setpc_b64 s[36:37]
   %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
   %tmp134 = call amdgpu_gfx i64 @strict_wwm_called_i64(i64 %tmp107)
   %tmp136 = add i64 %tmp134, %tmp107
@@ -701,38 +692,36 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-LABEL: strict_wwm_amdgpu_cs_main:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX9-O0-NEXT:    s_mov_b32 s8, s7
-; GFX9-O0-NEXT:    s_mov_b32 s9, s6
-; GFX9-O0-NEXT:    s_mov_b32 s10, s5
-; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
-; GFX9-O0-NEXT:    s_mov_b32 s5, s10
-; GFX9-O0-NEXT:    s_mov_b32 s6, s9
-; GFX9-O0-NEXT:    s_mov_b32 s7, s8
-; GFX9-O0-NEXT:    ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7
-; GFX9-O0-NEXT:    s_mov_b32 s8, 5
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s8, v0
-; GFX9-O0-NEXT:    s_mov_b32 s8, 0
-; GFX9-O0-NEXT:    buffer_load_dwordx4 v[10:13], v0, s[4:7], s8 offen
-; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], v0, s[4:7], s8 offen offset:16
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT:    s_mov_b32 s36, s4
+; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
+; GFX9-O0-NEXT:    s_mov_b32 s37, s5
+; GFX9-O0-NEXT:    s_mov_b32 s38, s6
+; GFX9-O0-NEXT:    s_mov_b32 s39, s7
+; GFX9-O0-NEXT:    ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39
+; GFX9-O0-NEXT:    s_mov_b32 s34, 5
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s34, v0
+; GFX9-O0-NEXT:    s_mov_b32 s34, 0
+; GFX9-O0-NEXT:    buffer_load_dwordx4 v[10:13], v0, s[36:39], s34 offen
+; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
-; GFX9-O0-NEXT:    s_mov_b32 s9, 0x7fffffff
-; GFX9-O0-NEXT:    s_mov_b32 s10, -1
-; GFX9-O0-NEXT:    ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
-; GFX9-O0-NEXT:    s_mov_b32 s11, s9
+; GFX9-O0-NEXT:    s_mov_b32 s35, 0x7fffffff
+; GFX9-O0-NEXT:    s_mov_b32 s40, -1
+; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
+; GFX9-O0-NEXT:    s_mov_b32 s41, s35
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s40
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s41
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
@@ -743,8 +732,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s40
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s41
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
@@ -752,8 +741,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s40
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s41
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
@@ -765,20 +754,20 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT:    buffer_store_dwordx4 v[5:8], v0, s[4:7], s8 offen
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[4:7], s8 offen offset:16
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O0-NEXT:    buffer_store_dwordx4 v[5:8], v0, s[36:39], s34 offen
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -787,25 +776,25 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; GFX9-O3-NEXT:    buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen
 ; GFX9-O3-NEXT:    buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16
-; GFX9-O3-NEXT:    s_mov_b32 s8, -1
-; GFX9-O3-NEXT:    s_brev_b32 s9, -2
+; GFX9-O3-NEXT:    s_mov_b32 s34, -1
+; GFX9-O3-NEXT:    s_brev_b32 s35, -2
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s34
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s35
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s8
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s34
+; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s35
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s8
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s9
+; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s34
+; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s35
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
@@ -815,7 +804,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
 ; GFX9-O3-NEXT:    buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -827,7 +816,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %tmp17 = shl i32 %index, 5


        


More information about the llvm-commits mailing list