[llvm] [WIP][AMDGPU] Improve the handling of `inreg` arguments (PR #133614)

Shilei Tian via llvm-commits llvm-commits at lists.llvm.org
Sun Feb 22 10:22:36 PST 2026


https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/133614

>From 043ec2a4ef82cab196113482fa7b8babe4fe567c Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Sun, 22 Feb 2026 11:05:07 -0500
Subject: [PATCH 1/3] [NFC][AMDGPU] Add test showing caller/callee SGPR
 mismatch for inreg args

Add a test demonstrating a bug where the caller and callee disagree on which
SGPRs hold user inreg arguments when there are enough to reach the SGPR0-3
range.

On the callee side, `LowerFormalArguments` marks SGPR0-3 as allocated in
`CCState` before the CC analysis runs. On the caller side, `LowerCall` adds the
scratch resource to `RegsToPass` without marking SGPR0-3 in `CCState`. This
causes `CC_AMDGPU_Func` to assign user inreg args to SGPR0-3 on the caller side
(they appear free) while the callee skips them.

In the test, the caller writes arg 0 (value 42) to s0, but the callee reads arg
0 from s16.
---
 .../AMDGPU/cc-inreg-sgpr0-3-mismatch.ll       | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll

diff --git a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
new file mode 100644
index 0000000000000..d0d2d65f7edf8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Test for a caller/callee mismatch in SGPR assignment for inreg args.
+;
+; The following test case demonstrates the bug with a function that takes 32
+; inreg i32 arguments and returns the first one. In the callee, arg 0 is read
+; from s16 (v_mov_b32_e32 v0, s16). In the caller, arg 0 (value 42) is written
+; to s0 (s_mov_b32 s0, 42). Since s0 != s16, the callee does not see the value
+; the caller intended to pass.
+;
+; The root cause is that the scratch resource descriptor occupies SGPR0-3. On
+; the callee side, LowerFormalArguments marks SGPR0-3 as allocated in CCState
+; before running the CC analysis, so CC_AMDGPU_Func skips them. On the caller
+; side, LowerCall adds the scratch resource to RegsToPass without marking
+; SGPR0-3 in CCState, so CC_AMDGPU_Func treats them as available and assigns
+; user inreg args there.
+
+define i32 @callee_returns_arg0(
+; CHECK-LABEL: callee_returns_arg0:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+    i32 inreg %a0,  i32 inreg %a1,  i32 inreg %a2,  i32 inreg %a3,
+    i32 inreg %a4,  i32 inreg %a5,  i32 inreg %a6,  i32 inreg %a7,
+    i32 inreg %a8,  i32 inreg %a9,  i32 inreg %a10, i32 inreg %a11,
+    i32 inreg %a12, i32 inreg %a13, i32 inreg %a14, i32 inreg %a15,
+    i32 inreg %a16, i32 inreg %a17, i32 inreg %a18, i32 inreg %a19,
+    i32 inreg %a20, i32 inreg %a21, i32 inreg %a22, i32 inreg %a23,
+    i32 inreg %a24, i32 inreg %a25, i32 inreg %a26, i32 inreg %a27,
+    i32 inreg %a28, i32 inreg %a29, i32 inreg %a30, i32 inreg %a31) {
+  ret i32 %a0
+}
+
+define i32 @caller_passes_42() {
+; CHECK-LABEL: caller_passes_42:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s42, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; CHECK-NEXT:    buffer_store_dword v14, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, callee_returns_arg0@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, callee_returns_arg0@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[40:41], s[16:17], 0x0
+; CHECK-NEXT:    v_writelane_b32 v14, s30, 0
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_mov_b32 s1, 1
+; CHECK-NEXT:    s_mov_b32 s2, 2
+; CHECK-NEXT:    s_mov_b32 s3, 3
+; CHECK-NEXT:    s_mov_b32 s16, 4
+; CHECK-NEXT:    s_mov_b32 s17, 5
+; CHECK-NEXT:    s_mov_b32 s18, 6
+; CHECK-NEXT:    s_mov_b32 s19, 7
+; CHECK-NEXT:    s_mov_b32 s20, 8
+; CHECK-NEXT:    s_mov_b32 s21, 9
+; CHECK-NEXT:    s_mov_b32 s22, 10
+; CHECK-NEXT:    s_mov_b32 s23, 11
+; CHECK-NEXT:    s_mov_b32 s24, 12
+; CHECK-NEXT:    s_mov_b32 s25, 13
+; CHECK-NEXT:    s_mov_b32 s26, 14
+; CHECK-NEXT:    s_mov_b32 s27, 15
+; CHECK-NEXT:    s_mov_b32 s28, 16
+; CHECK-NEXT:    s_mov_b32 s29, 17
+; CHECK-NEXT:    v_mov_b32_e32 v0, 18
+; CHECK-NEXT:    v_mov_b32_e32 v1, 19
+; CHECK-NEXT:    v_mov_b32_e32 v2, 20
+; CHECK-NEXT:    v_mov_b32_e32 v3, 21
+; CHECK-NEXT:    v_mov_b32_e32 v4, 22
+; CHECK-NEXT:    v_mov_b32_e32 v5, 23
+; CHECK-NEXT:    v_mov_b32_e32 v6, 24
+; CHECK-NEXT:    v_mov_b32_e32 v7, 25
+; CHECK-NEXT:    v_mov_b32_e32 v8, 26
+; CHECK-NEXT:    v_mov_b32_e32 v9, 27
+; CHECK-NEXT:    v_mov_b32_e32 v10, 28
+; CHECK-NEXT:    v_mov_b32_e32 v11, 29
+; CHECK-NEXT:    v_mov_b32_e32 v12, 30
+; CHECK-NEXT:    v_mov_b32_e32 v13, 31
+; CHECK-NEXT:    v_writelane_b32 v14, s31, 1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT:    v_readlane_b32 s31, v14, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v14, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT:    buffer_load_dword v14, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    s_mov_b32 s33, s42
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %r = call i32 @callee_returns_arg0(
+    i32 inreg 42, i32 inreg 1,  i32 inreg 2,  i32 inreg 3,
+    i32 inreg 4,  i32 inreg 5,  i32 inreg 6,  i32 inreg 7,
+    i32 inreg 8,  i32 inreg 9,  i32 inreg 10, i32 inreg 11,
+    i32 inreg 12, i32 inreg 13, i32 inreg 14, i32 inreg 15,
+    i32 inreg 16, i32 inreg 17, i32 inreg 18, i32 inreg 19,
+    i32 inreg 20, i32 inreg 21, i32 inreg 22, i32 inreg 23,
+    i32 inreg 24, i32 inreg 25, i32 inreg 26, i32 inreg 27,
+    i32 inreg 28, i32 inreg 29, i32 inreg 30, i32 inreg 31)
+  ret i32 %r
+}

>From 6a01b19ff3be0aea9707292bcd29951e57395ada Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Sun, 22 Feb 2026 11:50:49 -0500
Subject: [PATCH 2/3] [AMDGPU] Fix caller/callee mismatch in SGPR assignment
 for inreg args

On the callee side, `LowerFormalArguments` marks SGPR0-3 as allocated in
`CCState` before running the CC analysis. On the caller side, `LowerCall` (and
GlobalISel's `lowerCall`/`lowerTailCall`) added the scratch resource to
`RegsToPass` without marking it in `CCState`. This caused `CC_AMDGPU_Func` to
treat SGPR0-3 as available on the caller side, assigning user inreg args there,
while the callee skipped them.
---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  12 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   5 +
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |  82 ++--
 .../CodeGen/AMDGPU/call-args-inreg-bfloat.ll  |   2 -
 ...l-args-inreg-no-sgpr-for-csrspill-xfail.ll |  86 +++-
 llvm/test/CodeGen/AMDGPU/call-args-inreg.ll   | 412 +++++++++++++++++-
 .../AMDGPU/cc-inreg-sgpr0-3-mismatch.ll       | 207 +++++++--
 .../CodeGen/AMDGPU/function-args-inreg.ll     |   3 -
 ...-call-inreg-arguments.convergencetokens.ll |   6 +-
 .../AMDGPU/tail-call-inreg-arguments.error.ll |  16 +-
 .../AMDGPU/tail-call-inreg-arguments.ll       |  12 +-
 11 files changed, 730 insertions(+), 113 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5c6affdae0c5b..8e464c6168429 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1399,6 +1399,11 @@ bool AMDGPUCallLowering::lowerTailCall(
       return false;
   }
 
+  // Mark the scratch resource descriptor as allocated so the CC analysis
+  // does not assign user arguments to these registers, matching the callee.
+  if (!ST.hasFlatScratchEnabled())
+    CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+
   OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
 
   if (!determineAssignments(Assigner, OutArgs, CCInfo))
@@ -1598,6 +1603,13 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
       return false;
   }
 
+  // Mark the scratch resource descriptor as allocated so the CC analysis
+  // does not assign user arguments to these registers, matching the callee.
+  if (!ST.hasFlatScratchEnabled()) {
+    const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+    CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+  }
+
   // Do the actual argument marshalling.
   OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
   if (!determineAssignments(Assigner, OutArgs, CCInfo))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8533b1bd06d90..bf88005ce8737 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4329,6 +4329,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }
 
+  // Mark the scratch resource descriptor as allocated so the CC analysis
+  // does not assign user arguments to these registers, matching the callee.
+  if (!Subtarget->hasFlatScratchEnabled())
+    CCInfo.AllocateReg(Info->getScratchRSrcReg());
+
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index c935310584949..2c1fd5800899a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -5378,7 +5378,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5390,7 +5390,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_i16_inreg(i16 inreg %arg)
@@ -5424,7 +5424,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5436,7 +5436,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_i32_inreg(i32 inreg %arg)
@@ -5473,9 +5473,9 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5487,7 +5487,7 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_i64_inreg(i64 inreg %arg)
@@ -5524,9 +5524,9 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5538,7 +5538,7 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2i32_inreg(<2 x i32> inreg %arg)
@@ -5574,7 +5574,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5586,7 +5586,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_f16_inreg(half inreg %arg)
@@ -5634,7 +5634,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5646,7 +5646,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_f32_inreg(float inreg %arg)
@@ -5683,9 +5683,9 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5697,7 +5697,7 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_f64_inreg(double inreg %arg)
@@ -5732,7 +5732,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5744,7 +5744,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2f16_inreg(<2 x half> inreg %arg)
@@ -5787,10 +5787,10 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
   ; CHECK-NEXT:   [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s16>)
   ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5802,7 +5802,7 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v3f16_inreg(<3 x half> inreg %arg)
@@ -5840,10 +5840,10 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>)
   ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5855,7 +5855,7 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v4f16_inreg(<4 x half> inreg %arg)
@@ -5892,9 +5892,9 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p0)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5906,7 +5906,7 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_p0_inreg(ptr inreg %arg)
@@ -5943,9 +5943,9 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5957,7 +5957,7 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p1_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p1_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
@@ -5992,7 +5992,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY9]](p3)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[PTRTOINT]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -6004,7 +6004,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p3_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p3_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
@@ -6045,13 +6045,13 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
   ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p1>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
-  ; CHECK-NEXT:   $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
+  ; CHECK-NEXT:   $sgpr18 = COPY [[INTRINSIC_CONVERGENT2]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
-  ; CHECK-NEXT:   $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
+  ; CHECK-NEXT:   $sgpr19 = COPY [[INTRINSIC_CONVERGENT3]](s32)
   ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY13]](p4)
@@ -6063,7 +6063,7 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY19]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY20]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY21]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p1_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p1_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg)
@@ -6100,9 +6100,9 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p5>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -6114,7 +6114,7 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p5_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p5_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
index 04e472419ca61..0511db2f02ff2 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
@@ -23,7 +23,6 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -83,7 +82,6 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
index 34f4476f7fd6a..bd03e092c0fa0 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
@@ -1,22 +1,104 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 < %s | FileCheck -enable-var-scope %s
 
-; CHECK: illegal VGPR to SGPR copy
+; This test previously hit "illegal VGPR to SGPR copy" because the caller CC
+; analysis assigned user inreg args to SGPR0-3, overlapping the scratch resource
+; descriptor. With the fix that marks SGPR0-3 as allocated before CC analysis,
+; the overlap is eliminated and these functions compile correctly.
 
 declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
 declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0
 declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0
 
 define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 {
+; CHECK-LABEL: test_call_external_void_func_a15i32_inreg:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s40, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[42:43]
+; CHECK-NEXT:    v_writelane_b32 v40, s40, 2
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-NEXT:    s_getpc_b64 s[40:41]
+; CHECK-NEXT:    s_add_u32 s40, s40, external_void_func_a15i32_inreg@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0)
   ret void
 }
 
 define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 {
+; CHECK-LABEL: test_call_external_void_func_a16i32_inreg:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s40, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[42:43]
+; CHECK-NEXT:    v_writelane_b32 v40, s40, 2
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-NEXT:    s_getpc_b64 s[40:41]
+; CHECK-NEXT:    s_add_u32 s40, s40, external_void_func_a16i32_inreg@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s41, s41, external_void_func_a16i32_inreg@rel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0)
   ret void
 }
 
 define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s40, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[42:43]
+; CHECK-NEXT:    v_writelane_b32 v40, s40, 2
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-NEXT:    s_getpc_b64 s[40:41]
+; CHECK-NEXT:    s_add_u32 s40, s40, external_void_func_a15i32_inreg_i32_inreg@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s41, s41, external_void_func_a15i32_inreg_i32_inreg@rel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index f96007ae513bd..c1b3278144d59 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -46,7 +46,6 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i8_inreg@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -106,7 +105,6 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i16_inreg@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -166,7 +164,6 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i32_inreg@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -212,6 +209,33 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_i64_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -244,6 +268,33 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v2i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -276,6 +327,33 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s19, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s19, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v3i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -308,6 +386,33 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s20, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-NEXT:    v_writelane_b32 v40, s20, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v4i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -340,6 +445,33 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v8i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s24, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[26:27], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[26:27]
+; GFX9-NEXT:    v_writelane_b32 v40, s24, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[24:25]
+; GFX9-NEXT:    s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[24:25]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v8i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -386,7 +518,6 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f16_inreg@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -446,7 +577,6 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f32_inreg@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -492,6 +622,33 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_f64_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -538,7 +695,6 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2f16_inreg@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -584,6 +740,33 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
 }
 
 define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v3f16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -616,6 +799,33 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
 }
 
 define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v4f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v4f16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -648,6 +858,33 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
 }
 
 define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_p0_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_p0_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -680,6 +917,33 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_p1_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_p1_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -726,7 +990,6 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p3_inreg@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -772,6 +1035,33 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 }
 
 define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2p1_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s20, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-NEXT:    v_writelane_b32 v40, s20, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v2p1_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -804,6 +1094,33 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 }
 
 define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2p5_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v2p5_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -836,6 +1153,33 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 }
 
 define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg %arg1, i64 inreg %arg2) #0 {
+; GFX9-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s21, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-NEXT:    v_writelane_b32 v40, s21, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[22:23]
+; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -868,6 +1212,33 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 }
 
 define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #0 {
+; GFX9-LABEL: test_call_external_void_func_a15i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s29, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[40:41]
+; GFX9-NEXT:    v_writelane_b32 v40, s29, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[40:41]
+; GFX9-NEXT:    s_add_u32 s40, s40, external_void_func_a15i32_inreg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -902,6 +1273,33 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 
 ; FIXME: This should also fail
 define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inreg %arg0, i32 inreg %arg1) #1 {
+; GFX9-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s21, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-NEXT:    v_writelane_b32 v40, s21, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[22:23]
+; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
index d0d2d65f7edf8..86486e56d46ac 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GISEL %s
 
 ; Test for a caller/callee mismatch in SGPR assignment for inreg args.
 ;
@@ -22,6 +23,17 @@ define i32 @callee_returns_arg0(
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: callee_returns_arg0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: callee_returns_arg0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v0, s16
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
     i32 inreg %a0,  i32 inreg %a1,  i32 inreg %a2,  i32 inreg %a3,
     i32 inreg %a4,  i32 inreg %a5,  i32 inreg %a6,  i32 inreg %a7,
     i32 inreg %a8,  i32 inreg %a9,  i32 inreg %a10, i32 inreg %a11,
@@ -40,58 +52,175 @@ define i32 @caller_passes_42() {
 ; CHECK-NEXT:    s_mov_b32 s42, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
-; CHECK-NEXT:    buffer_store_dword v14, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, callee_returns_arg0@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, callee_returns_arg0@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[40:41], s[16:17], 0x0
-; CHECK-NEXT:    v_writelane_b32 v14, s30, 0
-; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_mov_b32 s1, 1
-; CHECK-NEXT:    s_mov_b32 s2, 2
-; CHECK-NEXT:    s_mov_b32 s3, 3
-; CHECK-NEXT:    s_mov_b32 s16, 4
-; CHECK-NEXT:    s_mov_b32 s17, 5
-; CHECK-NEXT:    s_mov_b32 s18, 6
-; CHECK-NEXT:    s_mov_b32 s19, 7
-; CHECK-NEXT:    s_mov_b32 s20, 8
-; CHECK-NEXT:    s_mov_b32 s21, 9
-; CHECK-NEXT:    s_mov_b32 s22, 10
-; CHECK-NEXT:    s_mov_b32 s23, 11
-; CHECK-NEXT:    s_mov_b32 s24, 12
-; CHECK-NEXT:    s_mov_b32 s25, 13
-; CHECK-NEXT:    s_mov_b32 s26, 14
-; CHECK-NEXT:    s_mov_b32 s27, 15
-; CHECK-NEXT:    s_mov_b32 s28, 16
-; CHECK-NEXT:    s_mov_b32 s29, 17
-; CHECK-NEXT:    v_mov_b32_e32 v0, 18
-; CHECK-NEXT:    v_mov_b32_e32 v1, 19
-; CHECK-NEXT:    v_mov_b32_e32 v2, 20
-; CHECK-NEXT:    v_mov_b32_e32 v3, 21
-; CHECK-NEXT:    v_mov_b32_e32 v4, 22
-; CHECK-NEXT:    v_mov_b32_e32 v5, 23
-; CHECK-NEXT:    v_mov_b32_e32 v6, 24
-; CHECK-NEXT:    v_mov_b32_e32 v7, 25
-; CHECK-NEXT:    v_mov_b32_e32 v8, 26
-; CHECK-NEXT:    v_mov_b32_e32 v9, 27
-; CHECK-NEXT:    v_mov_b32_e32 v10, 28
-; CHECK-NEXT:    v_mov_b32_e32 v11, 29
-; CHECK-NEXT:    v_mov_b32_e32 v12, 30
-; CHECK-NEXT:    v_mov_b32_e32 v13, 31
-; CHECK-NEXT:    v_writelane_b32 v14, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v18, s30, 0
+; CHECK-NEXT:    s_mov_b32 s16, 42
+; CHECK-NEXT:    s_mov_b32 s17, 1
+; CHECK-NEXT:    s_mov_b32 s18, 2
+; CHECK-NEXT:    s_mov_b32 s19, 3
+; CHECK-NEXT:    s_mov_b32 s20, 4
+; CHECK-NEXT:    s_mov_b32 s21, 5
+; CHECK-NEXT:    s_mov_b32 s22, 6
+; CHECK-NEXT:    s_mov_b32 s23, 7
+; CHECK-NEXT:    s_mov_b32 s24, 8
+; CHECK-NEXT:    s_mov_b32 s25, 9
+; CHECK-NEXT:    s_mov_b32 s26, 10
+; CHECK-NEXT:    s_mov_b32 s27, 11
+; CHECK-NEXT:    s_mov_b32 s28, 12
+; CHECK-NEXT:    s_mov_b32 s29, 13
+; CHECK-NEXT:    v_mov_b32_e32 v0, 14
+; CHECK-NEXT:    v_mov_b32_e32 v1, 15
+; CHECK-NEXT:    v_mov_b32_e32 v2, 16
+; CHECK-NEXT:    v_mov_b32_e32 v3, 17
+; CHECK-NEXT:    v_mov_b32_e32 v4, 18
+; CHECK-NEXT:    v_mov_b32_e32 v5, 19
+; CHECK-NEXT:    v_mov_b32_e32 v6, 20
+; CHECK-NEXT:    v_mov_b32_e32 v7, 21
+; CHECK-NEXT:    v_mov_b32_e32 v8, 22
+; CHECK-NEXT:    v_mov_b32_e32 v9, 23
+; CHECK-NEXT:    v_mov_b32_e32 v10, 24
+; CHECK-NEXT:    v_mov_b32_e32 v11, 25
+; CHECK-NEXT:    v_mov_b32_e32 v12, 26
+; CHECK-NEXT:    v_mov_b32_e32 v13, 27
+; CHECK-NEXT:    v_mov_b32_e32 v14, 28
+; CHECK-NEXT:    v_mov_b32_e32 v15, 29
+; CHECK-NEXT:    v_mov_b32_e32 v16, 30
+; CHECK-NEXT:    v_mov_b32_e32 v17, 31
+; CHECK-NEXT:    v_writelane_b32 v18, s31, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT:    v_readlane_b32 s31, v14, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v14, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v18, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v18, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_load_dword v14, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_mov_b32 s33, s42
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: caller_passes_42:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s42, s33
+; SDAG-NEXT:    s_mov_b32 s33, s32
+; SDAG-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; SDAG-NEXT:    buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
+; SDAG-NEXT:    s_mov_b64 exec, s[16:17]
+; SDAG-NEXT:    s_addk_i32 s32, 0x400
+; SDAG-NEXT:    s_getpc_b64 s[16:17]
+; SDAG-NEXT:    s_add_u32 s16, s16, callee_returns_arg0@gotpcrel32@lo+4
+; SDAG-NEXT:    s_addc_u32 s17, s17, callee_returns_arg0@gotpcrel32@hi+12
+; SDAG-NEXT:    s_load_dwordx2 s[40:41], s[16:17], 0x0
+; SDAG-NEXT:    v_writelane_b32 v18, s30, 0
+; SDAG-NEXT:    s_mov_b32 s16, 42
+; SDAG-NEXT:    s_mov_b32 s17, 1
+; SDAG-NEXT:    s_mov_b32 s18, 2
+; SDAG-NEXT:    s_mov_b32 s19, 3
+; SDAG-NEXT:    s_mov_b32 s20, 4
+; SDAG-NEXT:    s_mov_b32 s21, 5
+; SDAG-NEXT:    s_mov_b32 s22, 6
+; SDAG-NEXT:    s_mov_b32 s23, 7
+; SDAG-NEXT:    s_mov_b32 s24, 8
+; SDAG-NEXT:    s_mov_b32 s25, 9
+; SDAG-NEXT:    s_mov_b32 s26, 10
+; SDAG-NEXT:    s_mov_b32 s27, 11
+; SDAG-NEXT:    s_mov_b32 s28, 12
+; SDAG-NEXT:    s_mov_b32 s29, 13
+; SDAG-NEXT:    v_mov_b32_e32 v0, 14
+; SDAG-NEXT:    v_mov_b32_e32 v1, 15
+; SDAG-NEXT:    v_mov_b32_e32 v2, 16
+; SDAG-NEXT:    v_mov_b32_e32 v3, 17
+; SDAG-NEXT:    v_mov_b32_e32 v4, 18
+; SDAG-NEXT:    v_mov_b32_e32 v5, 19
+; SDAG-NEXT:    v_mov_b32_e32 v6, 20
+; SDAG-NEXT:    v_mov_b32_e32 v7, 21
+; SDAG-NEXT:    v_mov_b32_e32 v8, 22
+; SDAG-NEXT:    v_mov_b32_e32 v9, 23
+; SDAG-NEXT:    v_mov_b32_e32 v10, 24
+; SDAG-NEXT:    v_mov_b32_e32 v11, 25
+; SDAG-NEXT:    v_mov_b32_e32 v12, 26
+; SDAG-NEXT:    v_mov_b32_e32 v13, 27
+; SDAG-NEXT:    v_mov_b32_e32 v14, 28
+; SDAG-NEXT:    v_mov_b32_e32 v15, 29
+; SDAG-NEXT:    v_mov_b32_e32 v16, 30
+; SDAG-NEXT:    v_mov_b32_e32 v17, 31
+; SDAG-NEXT:    v_writelane_b32 v18, s31, 1
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; SDAG-NEXT:    v_readlane_b32 s31, v18, 1
+; SDAG-NEXT:    v_readlane_b32 s30, v18, 0
+; SDAG-NEXT:    s_mov_b32 s32, s33
+; SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SDAG-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
+; SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; SDAG-NEXT:    s_mov_b32 s33, s42
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: caller_passes_42:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s42, s33
+; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; GISEL-NEXT:    buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
+; GISEL-NEXT:    s_mov_b64 exec, s[16:17]
+; GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GISEL-NEXT:    s_getpc_b64 s[16:17]
+; GISEL-NEXT:    s_add_u32 s16, s16, callee_returns_arg0@gotpcrel32@lo+4
+; GISEL-NEXT:    s_addc_u32 s17, s17, callee_returns_arg0@gotpcrel32@hi+12
+; GISEL-NEXT:    s_load_dwordx2 s[40:41], s[16:17], 0x0
+; GISEL-NEXT:    v_writelane_b32 v18, s30, 0
+; GISEL-NEXT:    s_mov_b32 s16, 42
+; GISEL-NEXT:    s_mov_b32 s17, 1
+; GISEL-NEXT:    s_mov_b32 s18, 2
+; GISEL-NEXT:    s_mov_b32 s19, 3
+; GISEL-NEXT:    s_mov_b32 s20, 4
+; GISEL-NEXT:    s_mov_b32 s21, 5
+; GISEL-NEXT:    s_mov_b32 s22, 6
+; GISEL-NEXT:    s_mov_b32 s23, 7
+; GISEL-NEXT:    s_mov_b32 s24, 8
+; GISEL-NEXT:    s_mov_b32 s25, 9
+; GISEL-NEXT:    s_mov_b32 s26, 10
+; GISEL-NEXT:    s_mov_b32 s27, 11
+; GISEL-NEXT:    s_mov_b32 s28, 12
+; GISEL-NEXT:    s_mov_b32 s29, 13
+; GISEL-NEXT:    v_mov_b32_e32 v0, 14
+; GISEL-NEXT:    v_mov_b32_e32 v1, 15
+; GISEL-NEXT:    v_mov_b32_e32 v2, 16
+; GISEL-NEXT:    v_mov_b32_e32 v3, 17
+; GISEL-NEXT:    v_mov_b32_e32 v4, 18
+; GISEL-NEXT:    v_mov_b32_e32 v5, 19
+; GISEL-NEXT:    v_mov_b32_e32 v6, 20
+; GISEL-NEXT:    v_mov_b32_e32 v7, 21
+; GISEL-NEXT:    v_mov_b32_e32 v8, 22
+; GISEL-NEXT:    v_mov_b32_e32 v9, 23
+; GISEL-NEXT:    v_mov_b32_e32 v10, 24
+; GISEL-NEXT:    v_mov_b32_e32 v11, 25
+; GISEL-NEXT:    v_mov_b32_e32 v12, 26
+; GISEL-NEXT:    v_mov_b32_e32 v13, 27
+; GISEL-NEXT:    v_mov_b32_e32 v14, 28
+; GISEL-NEXT:    v_mov_b32_e32 v15, 29
+; GISEL-NEXT:    v_mov_b32_e32 v16, 30
+; GISEL-NEXT:    v_mov_b32_e32 v17, 31
+; GISEL-NEXT:    v_writelane_b32 v18, s31, 1
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; GISEL-NEXT:    v_readlane_b32 s31, v18, 1
+; GISEL-NEXT:    v_readlane_b32 s30, v18, 0
+; GISEL-NEXT:    s_mov_b32 s32, s33
+; GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GISEL-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
+; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL-NEXT:    s_mov_b32 s33, s42
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @callee_returns_arg0(
     i32 inreg 42, i32 inreg 1,  i32 inreg 2,  i32 inreg 3,
     i32 inreg 4,  i32 inreg 5,  i32 inreg 6,  i32 inreg 7,
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 831d10480c51c..0bf987ab6f19c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1734,9 +1734,6 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
 ; GFX9-NEXT:    s_load_dwordx2 s[20:21], s[20:21], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s2, s18
-; GFX9-NEXT:    s_mov_b32 s1, s17
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll
index 41e7ad7b8e2fa..86275a8444c45 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll
@@ -43,10 +43,10 @@ define void @tail_call_i64_inreg_uniform_in_vgpr_convergence_tokens() #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY2]]
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY1]]
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY]]
-  ; CHECK-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
-  ; CHECK-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; CHECK-NEXT:   $sgpr16 = COPY [[V_READFIRSTLANE_B32_]]
+  ; CHECK-NEXT:   $sgpr17 = COPY [[V_READFIRSTLANE_B32_1]]
   ; CHECK-NEXT:   CONVERGENCECTRL_GLUE [[CONVERGENCECTRL_ENTRY]]
-  ; CHECK-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @void_func_i64_inreg, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0, implicit $sgpr1, implicit [[CONVERGENCECTRL_ENTRY]]
+  ; CHECK-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @void_func_i64_inreg, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr16, implicit $sgpr17, implicit [[CONVERGENCECTRL_ENTRY]]
   %t = call token @llvm.experimental.convergence.entry()
   %uniform.vgpr = load i64, ptr addrspace(3) zeroinitializer, align 8
   tail call void @void_func_i64_inreg(i64 inreg %uniform.vgpr) #0 [ "convergencectrl"(token %t) ]
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
index 242b5e9aeaf42..cb34262e2d4ee 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
@@ -21,11 +21,11 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) {
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, void_func_i32_inreg@rel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12
-; CHECK-NEXT:     ; illegal copy v0 to s0
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_getpc_b64 s[18:19]
+; CHECK-NEXT:    s_add_u32 s18, s18, void_func_i32_inreg@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s19, s19, void_func_i32_inreg@rel32@hi+12
+; CHECK-NEXT:     ; illegal copy v0 to s16
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
@@ -56,12 +56,12 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) {
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, constant@rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, constant@rel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:     ; illegal copy v0 to s0
+; CHECK-NEXT:     ; illegal copy v0 to s16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
index 5506b29b99895..2449812e61661 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
@@ -10,7 +10,6 @@ define void @tail_call_i32_inreg_uniform(i32 inreg %sgpr) {
 ; CHECK-NEXT:    s_getpc_b64 s[18:19]
 ; CHECK-NEXT:    s_add_u32 s18, s18, void_func_i32_inreg@rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s19, s19, void_func_i32_inreg@rel32@hi+12
-; CHECK-NEXT:    s_mov_b32 s0, s16
 ; CHECK-NEXT:    s_setpc_b64 s[18:19]
   tail call void @void_func_i32_inreg(i32 inreg %sgpr)
   ret void
@@ -26,7 +25,6 @@ define void @indirect_tail_call_i32_inreg_uniform(i32 inreg %sgpr) {
 ; CHECK-NEXT:    s_add_u32 s18, s18, constant@rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s19, s19, constant@rel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[18:19], 0x0
-; CHECK-NEXT:    s_mov_b32 s0, s16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[18:19]
   %fptr = load ptr, ptr addrspace(4) @constant, align 8
@@ -44,8 +42,6 @@ define void @tail_call_i64_inreg_uniform(i64 inreg %sgpr) {
 ; CHECK-NEXT:    s_add_u32 s18, s18, void_func_i64_inreg@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s19, s19, void_func_i64_inreg@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[18:19], 0x0
-; CHECK-NEXT:    s_mov_b32 s1, s17
-; CHECK-NEXT:    s_mov_b32 s0, s16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[18:19]
   tail call void @void_func_i64_inreg(i64 inreg %sgpr)
@@ -61,11 +57,11 @@ define void @tail_call_i64_inreg_uniform_in_vgpr() {
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, void_func_i64_inreg@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, void_func_i64_inreg@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
-; CHECK-NEXT:    s_setpc_b64 s[16:17]
+; CHECK-NEXT:    v_readfirstlane_b32 s16, v0
+; CHECK-NEXT:    v_readfirstlane_b32 s17, v1
+; CHECK-NEXT:    s_setpc_b64 s[18:19]
   %uniform.vgpr = load i64, ptr addrspace(3) zeroinitializer, align 8
   tail call void @void_func_i64_inreg(i64 inreg %uniform.vgpr)
   ret void

>From 7564f226b0b92e1e30cb31f1a5cec7673c18ab90 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Sun, 22 Feb 2026 13:20:10 -0500
Subject: [PATCH 3/3] [AMDGPU] Pack overflow inreg args into VGPR lanes

When inreg function arguments overflow the available SGPRs, pack multiple values
into lanes of a single VGPR using writelane/readlane instead of consuming one
VGPR per overflow argument.

The feature is behind a flag (default off) and currently only supports the
SelectionDAG path.

Known issue: if the register allocator does not coalesce the COPY between the
writelane chain and the physical call argument register, the resulting v_mov_b32
is EXEC-dependent and will not transfer inactive lanes. This is correct when
EXEC is all-ones (the common case at call sites) but would be incorrect inside
divergent control flow.
---
 .../lib/Target/AMDGPU/AMDGPULanePackedABI.cpp | 152 ++++++++++
 llvm/lib/Target/AMDGPU/AMDGPULanePackedABI.h  |  54 ++++
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  54 +++-
 .../CodeGen/AMDGPU/inreg-vgpr-lane-packing.ll | 282 ++++++++++++++++++
 5 files changed, 540 insertions(+), 3 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULanePackedABI.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULanePackedABI.h
 create mode 100644 llvm/test/CodeGen/AMDGPU/inreg-vgpr-lane-packing.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULanePackedABI.cpp b/llvm/lib/Target/AMDGPU/AMDGPULanePackedABI.cpp
new file mode 100644
index 0000000000000..24c21b76340ac
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULanePackedABI.cpp
@@ -0,0 +1,152 @@
+//===-- AMDGPULanePackedABI.cpp - Lane-packed inreg arg ABI ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULanePackedABI.h"
+#include "AMDGPU.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableInRegVGPRLanePacking(
+    "amdgpu-inreg-vgpr-lane-packing", cl::Hidden,
+    cl::desc(
+        "Pack overflow inreg args into VGPR lanes using writelane/readlane"),
+    cl::init(false));
+
+bool llvm::isInRegVGPRLanePackingEnabled() {
+  return EnableInRegVGPRLanePacking;
+}
+
+static const MCPhysReg SGPRList[] = {
+    AMDGPU::SGPR0,  AMDGPU::SGPR1,  AMDGPU::SGPR2,  AMDGPU::SGPR3,
+    AMDGPU::SGPR4,  AMDGPU::SGPR5,  AMDGPU::SGPR6,  AMDGPU::SGPR7,
+    AMDGPU::SGPR8,  AMDGPU::SGPR9,  AMDGPU::SGPR10, AMDGPU::SGPR11,
+    AMDGPU::SGPR12, AMDGPU::SGPR13, AMDGPU::SGPR14, AMDGPU::SGPR15,
+    AMDGPU::SGPR16, AMDGPU::SGPR17, AMDGPU::SGPR18, AMDGPU::SGPR19,
+    AMDGPU::SGPR20, AMDGPU::SGPR21, AMDGPU::SGPR22, AMDGPU::SGPR23,
+    AMDGPU::SGPR24, AMDGPU::SGPR25, AMDGPU::SGPR26, AMDGPU::SGPR27,
+    AMDGPU::SGPR28, AMDGPU::SGPR29};
+
+static const MCPhysReg VGPRList[] = {
+    AMDGPU::VGPR0,  AMDGPU::VGPR1,  AMDGPU::VGPR2,  AMDGPU::VGPR3,
+    AMDGPU::VGPR4,  AMDGPU::VGPR5,  AMDGPU::VGPR6,  AMDGPU::VGPR7,
+    AMDGPU::VGPR8,  AMDGPU::VGPR9,  AMDGPU::VGPR10, AMDGPU::VGPR11,
+    AMDGPU::VGPR12, AMDGPU::VGPR13, AMDGPU::VGPR14, AMDGPU::VGPR15,
+    AMDGPU::VGPR16, AMDGPU::VGPR17, AMDGPU::VGPR18, AMDGPU::VGPR19,
+    AMDGPU::VGPR20, AMDGPU::VGPR21, AMDGPU::VGPR22, AMDGPU::VGPR23,
+    AMDGPU::VGPR24, AMDGPU::VGPR25, AMDGPU::VGPR26, AMDGPU::VGPR27,
+    AMDGPU::VGPR28, AMDGPU::VGPR29, AMDGPU::VGPR30, AMDGPU::VGPR31};
+
+template <typename ArgT>
+void llvm::analyzeArgsWithLanePacking(CCState &State,
+                                      const SmallVectorImpl<ArgT> &Args,
+                                      bool IsWave32) {
+  MCPhysReg PackingVGPR = AMDGPU::NoRegister;
+  unsigned NextLane = 0;
+  unsigned LanesPerVGPR = IsWave32 ? 32 : 64;
+
+  for (unsigned I = 0, E = Args.size(); I != E; ++I) {
+    const ArgT &Arg = Args[I];
+    ISD::ArgFlagsTy Flags = Arg.Flags;
+    MVT VT = Arg.VT;
+
+    if (Flags.isByVal()) {
+      State.HandleByVal(I, VT, VT, CCValAssign::Full, 1, Align(4), Flags);
+      continue;
+    }
+
+    MVT LocVT = VT;
+    CCValAssign::LocInfo LocInfo = CCValAssign::Full;
+
+    if (VT == MVT::i1)
+      LocVT = MVT::i32;
+
+    if ((VT == MVT::i8 || VT == MVT::i16) && (Flags.isSExt() || Flags.isZExt()))
+      LocVT = MVT::i32;
+
+    auto AssignToStack = [&]() {
+      unsigned Offset = State.AllocateStack(4, Align(4));
+      State.addLoc(CCValAssign::getMem(I, VT, Offset, LocVT, LocInfo));
+    };
+
+    if (Flags.isInReg()) {
+      if (MCPhysReg Reg = State.AllocateReg(SGPRList)) {
+        State.addLoc(CCValAssign::getReg(I, VT, Reg, LocVT, LocInfo));
+        continue;
+      }
+
+      if (NextLane >= LanesPerVGPR || PackingVGPR == AMDGPU::NoRegister) {
+        PackingVGPR = State.AllocateReg(VGPRList);
+        NextLane = 0;
+        if (PackingVGPR == AMDGPU::NoRegister) {
+          AssignToStack();
+          continue;
+        }
+      }
+
+      State.addLoc(
+          CCValAssign::getCustomReg(I, VT, PackingVGPR, LocVT, LocInfo));
+      ++NextLane;
+      continue;
+    }
+
+    if (MCPhysReg Reg = State.AllocateReg(VGPRList)) {
+      State.addLoc(CCValAssign::getReg(I, VT, Reg, LocVT, LocInfo));
+      continue;
+    }
+
+    AssignToStack();
+  }
+}
+
+template void llvm::analyzeArgsWithLanePacking<ISD::InputArg>(
+    CCState &, const SmallVectorImpl<ISD::InputArg> &, bool);
+template void llvm::analyzeArgsWithLanePacking<ISD::OutputArg>(
+    CCState &, const SmallVectorImpl<ISD::OutputArg> &, bool);
+
+void llvm::packOverflowInRegToVGPRLanes(
+    SmallVectorImpl<CCValAssign> &ArgLocs,
+    const std::function<bool(unsigned)> &IsInReg, bool IsWave32) {
+  unsigned LanesPerVGPR = IsWave32 ? 32 : 64;
+  MCPhysReg PackingVGPR = AMDGPU::NoRegister;
+  unsigned NextLane = 0;
+
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    if (!VA.isRegLoc() || !IsInReg(I))
+      continue;
+    if (!AMDGPU::VGPR_32RegClass.contains(VA.getLocReg()))
+      continue;
+
+    if (PackingVGPR == AMDGPU::NoRegister || NextLane >= LanesPerVGPR) {
+      PackingVGPR = VA.getLocReg();
+      NextLane = 0;
+    }
+
+    ArgLocs[I] =
+        CCValAssign::getCustomReg(VA.getValNo(), VA.getValVT(), PackingVGPR,
+                                  VA.getLocVT(), VA.getLocInfo());
+    ++NextLane;
+  }
+}
+
+unsigned
+llvm::getLaneIndexForPackedArg(const SmallVectorImpl<CCValAssign> &Locs,
+                               unsigned Idx) {
+  assert(Locs[Idx].needsCustom() && Locs[Idx].isRegLoc());
+  MCPhysReg VGPR = Locs[Idx].getLocReg();
+  unsigned Lane = 0;
+  for (unsigned I = 0; I < Idx; ++I) {
+    if (Locs[I].needsCustom() && Locs[I].isRegLoc() &&
+        Locs[I].getLocReg() == VGPR)
+      ++Lane;
+  }
+  return Lane;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULanePackedABI.h b/llvm/lib/Target/AMDGPU/AMDGPULanePackedABI.h
new file mode 100644
index 0000000000000..c29c1228d1309
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULanePackedABI.h
@@ -0,0 +1,54 @@
+//===-- AMDGPULanePackedABI.h - Lane-packed inreg arg ABI -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Custom calling convention analysis that packs overflow inreg arguments
+/// into VGPR lanes using writelane/readlane instead of consuming one VGPR
+/// per overflow argument.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPULANEPACKEDABI_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPULANEPACKEDABI_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/TargetCallingConv.h"
+#include <functional>
+
+namespace llvm {
+
+/// Custom CC analysis that packs overflow inreg arguments into VGPR lanes.
+/// Inreg args are first assigned to SGPRs. When SGPRs are exhausted, overflow
+/// inreg args share VGPR lanes (up to 32 for wave32, 64 for wave64 per VGPR).
+/// Non-inreg args are assigned to individual VGPRs normally.
+///
+/// Lane-packed entries are marked with needsCustom() == true.
+template <typename ArgT>
+void analyzeArgsWithLanePacking(CCState &State,
+                                const SmallVectorImpl<ArgT> &Args,
+                                bool IsWave32);
+
+/// Post-process CC results: repack overflow inreg VGPR assignments into
+/// lane-packed entries. IsInReg returns true if the arg at the given index
+/// has the inreg flag. This is used by GlobalISel which doesn't use
+/// InputArg/OutputArg directly.
+void packOverflowInRegToVGPRLanes(SmallVectorImpl<CCValAssign> &ArgLocs,
+                                  const std::function<bool(unsigned)> &IsInReg,
+                                  bool IsWave32);
+
+/// Compute the lane index for a lane-packed CCValAssign entry.
+/// The lane index is the count of prior custom entries with the same VGPR.
+unsigned getLaneIndexForPackedArg(const SmallVectorImpl<CCValAssign> &Locs,
+                                  unsigned Idx);
+
+/// Returns true if the lane-packing feature flag is enabled.
+bool isInRegVGPRLanePackingEnabled();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPULANEPACKEDABI_H
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index ae684a58cfd26..f0eff14c4debb 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -68,6 +68,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUInstructionSelector.cpp
   AMDGPUISelDAGToDAG.cpp
   AMDGPUISelLowering.cpp
+  AMDGPULanePackedABI.cpp
   AMDGPULateCodeGenPrepare.cpp
   AMDGPULegalizerInfo.cpp
   AMDGPULibCalls.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bf88005ce8737..96d12e77d3afa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,6 +15,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPULaneMaskUtils.h"
+#include "AMDGPULanePackedABI.h"
 #include "AMDGPUSelectionDAGInfo.h"
 #include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
@@ -3450,8 +3451,14 @@ SDValue SITargetLowering::LowerFormalArguments(
   }
 
   if (!IsKernel) {
-    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
-    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+    if (isInRegVGPRLanePackingEnabled() &&
+        (CallConv == CallingConv::C || CallConv == CallingConv::Fast ||
+         CallConv == CallingConv::Cold)) {
+      analyzeArgsWithLanePacking(CCInfo, Splits, Subtarget->isWave32());
+    } else {
+      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+      CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+    }
 
     // This assumes the registers are allocated by CCInfo in ascending order
     // with no gaps.
@@ -3635,6 +3642,19 @@ SDValue SITargetLowering::LowerFormalArguments(
 
     assert(VA.isRegLoc() && "Parameter must be in a register!");
 
+    if (VA.needsCustom()) {
+      Register Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::VGPR_32RegClass);
+      SDValue PackedVal = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i32);
+      unsigned Lane = getLaneIndexForPackedArg(ArgLocs, i);
+      SDValue Val = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+          DAG.getTargetConstant(Intrinsic::amdgcn_readlane, DL, MVT::i32),
+          PackedVal, DAG.getConstant(Lane, DL, MVT::i32));
+      Val = convertABITypeToValueType(DAG, Val, VA, DL);
+      InVals.push_back(Val);
+      continue;
+    }
+
     Register Reg = VA.getLocReg();
     const TargetRegisterClass *RC = nullptr;
     if (AMDGPU::VGPR_32RegClass.contains(Reg))
@@ -4334,7 +4354,13 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   if (!Subtarget->hasFlatScratchEnabled())
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
 
-  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
+  if (isInRegVGPRLanePackingEnabled() &&
+      (CallConv == CallingConv::C || CallConv == CallingConv::Fast ||
+       CallConv == CallingConv::Cold)) {
+    analyzeArgsWithLanePacking(CCInfo, Outs, Subtarget->isWave32());
+  } else {
+    CCInfo.AnalyzeCallOperands(Outs, AssignFn);
+  }
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getStackSize();
@@ -4379,6 +4405,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   MVT PtrVT = MVT::i32;
 
+  DenseMap<MCPhysReg, SmallVector<std::pair<unsigned, SDValue>, 4>>
+      LanePackGroups;
+
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
@@ -4407,6 +4436,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
       llvm_unreachable("Unknown loc info!");
     }
 
+    if (VA.needsCustom() && VA.isRegLoc()) {
+      unsigned Lane = getLaneIndexForPackedArg(ArgLocs, i);
+      LanePackGroups[VA.getLocReg()].push_back({Lane, Arg});
+      continue;
+    }
+
     if (VA.isRegLoc()) {
       RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
     } else {
@@ -4475,6 +4510,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     }
   }
 
+  // Build writelane chains for lane-packed overflow inreg args.
+  SDValue WritelaneID =
+      DAG.getTargetConstant(Intrinsic::amdgcn_writelane, DL, MVT::i32);
+  for (auto &[VGPR, Lanes] : LanePackGroups) {
+    SDValue Packed = DAG.getUNDEF(MVT::i32);
+    for (auto &[LaneIdx, ArgVal] : Lanes) {
+      Packed =
+          DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, WritelaneID,
+                      ArgVal, DAG.getConstant(LaneIdx, DL, MVT::i32), Packed);
+    }
+    RegsToPass.push_back({VGPR, Packed});
+  }
+
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
 
diff --git a/llvm/test/CodeGen/AMDGPU/inreg-vgpr-lane-packing.ll b/llvm/test/CodeGen/AMDGPU/inreg-vgpr-lane-packing.ll
new file mode 100644
index 0000000000000..b9df0a3e6fc7b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inreg-vgpr-lane-packing.ll
@@ -0,0 +1,282 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-inreg-vgpr-lane-packing < %s | FileCheck %s
+
+; Test that overflow inreg arguments are packed into VGPR lanes using
+; writelane/readlane instead of consuming individual VGPRs.
+;
+; Each test has a callee + caller pair so we can verify:
+; - Callee: overflow inreg args are extracted via v_readlane_b32 into SGPRs
+;   and used in s_ (SALU) instructions
+; - Caller: overflow inreg args are packed via v_writelane_b32 into the same
+;   VGPR and lane that the callee reads from
+
+; --- Test 1: SGPR arg + first overflow arg ---
+; Callee reads a0 from SGPR and a14 from VGPR lane 0, adds with s_add.
+; Caller writes 100 to the SGPR for a0 and 200 to VGPR lane 0 for a14.
+define i32 @callee_add_sgpr_and_overflow(
+; CHECK-LABEL: callee_add_sgpr_and_overflow:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 0
+; CHECK-NEXT:    s_add_i32 s16, s16, s4
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+    i32 inreg %a0,  i32 inreg %a1,  i32 inreg %a2,  i32 inreg %a3,
+    i32 inreg %a4,  i32 inreg %a5,  i32 inreg %a6,  i32 inreg %a7,
+    i32 inreg %a8,  i32 inreg %a9,  i32 inreg %a10, i32 inreg %a11,
+    i32 inreg %a12, i32 inreg %a13, i32 inreg %a14, i32 inreg %a15,
+    i32 inreg %a16, i32 inreg %a17, i32 inreg %a18, i32 inreg %a19,
+    i32 inreg %a20, i32 inreg %a21, i32 inreg %a22, i32 inreg %a23,
+    i32 inreg %a24, i32 inreg %a25, i32 inreg %a26, i32 inreg %a27,
+    i32 inreg %a28, i32 inreg %a29, i32 inreg %a30, i32 inreg %a31) {
+  %sum = add i32 %a0, %a14
+  ret i32 %sum
+}
+
+define i32 @caller_add_sgpr_and_overflow() {
+; CHECK-LABEL: caller_add_sgpr_and_overflow:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s42, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
+; CHECK-NEXT:    s_movk_i32 s16, 0xc8
+; CHECK-NEXT:    v_writelane_b32 v0, s16, 0
+; CHECK-NEXT:    v_writelane_b32 v0, 15, 1
+; CHECK-NEXT:    v_writelane_b32 v0, 16, 2
+; CHECK-NEXT:    v_writelane_b32 v0, 17, 3
+; CHECK-NEXT:    v_writelane_b32 v0, 18, 4
+; CHECK-NEXT:    v_writelane_b32 v0, 19, 5
+; CHECK-NEXT:    v_writelane_b32 v0, 20, 6
+; CHECK-NEXT:    v_writelane_b32 v0, 21, 7
+; CHECK-NEXT:    v_writelane_b32 v0, 22, 8
+; CHECK-NEXT:    v_writelane_b32 v0, 23, 9
+; CHECK-NEXT:    v_writelane_b32 v0, 24, 10
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v0, 25, 11
+; CHECK-NEXT:    v_writelane_b32 v0, 26, 12
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, callee_add_sgpr_and_overflow@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, callee_add_sgpr_and_overflow@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v0, 27, 13
+; CHECK-NEXT:    s_load_dwordx2 s[40:41], s[16:17], 0x0
+; CHECK-NEXT:    v_writelane_b32 v0, 28, 14
+; CHECK-NEXT:    v_writelane_b32 v0, 29, 15
+; CHECK-NEXT:    v_writelane_b32 v0, 30, 16
+; CHECK-NEXT:    v_writelane_b32 v1, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v0, 31, 17
+; CHECK-NEXT:    s_movk_i32 s16, 0x64
+; CHECK-NEXT:    s_mov_b32 s17, 1
+; CHECK-NEXT:    s_mov_b32 s18, 2
+; CHECK-NEXT:    s_mov_b32 s19, 3
+; CHECK-NEXT:    s_mov_b32 s20, 4
+; CHECK-NEXT:    s_mov_b32 s21, 5
+; CHECK-NEXT:    s_mov_b32 s22, 6
+; CHECK-NEXT:    s_mov_b32 s23, 7
+; CHECK-NEXT:    s_mov_b32 s24, 8
+; CHECK-NEXT:    s_mov_b32 s25, 9
+; CHECK-NEXT:    s_mov_b32 s26, 10
+; CHECK-NEXT:    s_mov_b32 s27, 11
+; CHECK-NEXT:    s_mov_b32 s28, 12
+; CHECK-NEXT:    s_mov_b32 s29, 13
+; CHECK-NEXT:    v_writelane_b32 v1, s31, 1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    s_mov_b32 s33, s42
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %r = call i32 @callee_add_sgpr_and_overflow(
+    i32 inreg 100, i32 inreg 1,  i32 inreg 2,  i32 inreg 3,
+    i32 inreg 4,   i32 inreg 5,  i32 inreg 6,  i32 inreg 7,
+    i32 inreg 8,   i32 inreg 9,  i32 inreg 10, i32 inreg 11,
+    i32 inreg 12,  i32 inreg 13, i32 inreg 200, i32 inreg 15,
+    i32 inreg 16,  i32 inreg 17, i32 inreg 18, i32 inreg 19,
+    i32 inreg 20,  i32 inreg 21, i32 inreg 22, i32 inreg 23,
+    i32 inreg 24,  i32 inreg 25, i32 inreg 26, i32 inreg 27,
+    i32 inreg 28,  i32 inreg 29, i32 inreg 30, i32 inreg 31)
+  ret i32 %r
+}
+
+; --- Test 2: two overflow args from different VGPR lanes ---
+; Callee reads a30 and a31 from high lanes of the same VGPR, adds with s_add.
+; Caller writes 42 and 58 to those lanes via writelane.
+define i32 @callee_add_two_overflow(
+; CHECK-LABEL: callee_add_two_overflow:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 16
+; CHECK-NEXT:    s_add_i32 s5, s5, s4
+; CHECK-NEXT:    v_mov_b32_e32 v0, s5
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+    i32 inreg %a0,  i32 inreg %a1,  i32 inreg %a2,  i32 inreg %a3,
+    i32 inreg %a4,  i32 inreg %a5,  i32 inreg %a6,  i32 inreg %a7,
+    i32 inreg %a8,  i32 inreg %a9,  i32 inreg %a10, i32 inreg %a11,
+    i32 inreg %a12, i32 inreg %a13, i32 inreg %a14, i32 inreg %a15,
+    i32 inreg %a16, i32 inreg %a17, i32 inreg %a18, i32 inreg %a19,
+    i32 inreg %a20, i32 inreg %a21, i32 inreg %a22, i32 inreg %a23,
+    i32 inreg %a24, i32 inreg %a25, i32 inreg %a26, i32 inreg %a27,
+    i32 inreg %a28, i32 inreg %a29, i32 inreg %a30, i32 inreg %a31) {
+  %sum = add i32 %a30, %a31
+  ret i32 %sum
+}
+
+define i32 @caller_add_two_overflow() {
+; CHECK-LABEL: caller_add_two_overflow:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s42, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
+; CHECK-NEXT:    v_writelane_b32 v0, 14, 0
+; CHECK-NEXT:    v_writelane_b32 v0, 15, 1
+; CHECK-NEXT:    v_writelane_b32 v0, 16, 2
+; CHECK-NEXT:    v_writelane_b32 v0, 17, 3
+; CHECK-NEXT:    v_writelane_b32 v0, 18, 4
+; CHECK-NEXT:    v_writelane_b32 v0, 19, 5
+; CHECK-NEXT:    v_writelane_b32 v0, 20, 6
+; CHECK-NEXT:    v_writelane_b32 v0, 21, 7
+; CHECK-NEXT:    v_writelane_b32 v0, 22, 8
+; CHECK-NEXT:    v_writelane_b32 v0, 23, 9
+; CHECK-NEXT:    v_writelane_b32 v0, 24, 10
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v0, 25, 11
+; CHECK-NEXT:    v_writelane_b32 v0, 26, 12
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, callee_add_two_overflow@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, callee_add_two_overflow@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v0, 27, 13
+; CHECK-NEXT:    s_load_dwordx2 s[40:41], s[16:17], 0x0
+; CHECK-NEXT:    v_writelane_b32 v0, 28, 14
+; CHECK-NEXT:    v_writelane_b32 v0, 29, 15
+; CHECK-NEXT:    v_writelane_b32 v0, 42, 16
+; CHECK-NEXT:    v_writelane_b32 v1, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v0, 58, 17
+; CHECK-NEXT:    s_mov_b32 s16, 0
+; CHECK-NEXT:    s_mov_b32 s17, 1
+; CHECK-NEXT:    s_mov_b32 s18, 2
+; CHECK-NEXT:    s_mov_b32 s19, 3
+; CHECK-NEXT:    s_mov_b32 s20, 4
+; CHECK-NEXT:    s_mov_b32 s21, 5
+; CHECK-NEXT:    s_mov_b32 s22, 6
+; CHECK-NEXT:    s_mov_b32 s23, 7
+; CHECK-NEXT:    s_mov_b32 s24, 8
+; CHECK-NEXT:    s_mov_b32 s25, 9
+; CHECK-NEXT:    s_mov_b32 s26, 10
+; CHECK-NEXT:    s_mov_b32 s27, 11
+; CHECK-NEXT:    s_mov_b32 s28, 12
+; CHECK-NEXT:    s_mov_b32 s29, 13
+; CHECK-NEXT:    v_writelane_b32 v1, s31, 1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    s_mov_b32 s33, s42
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %r = call i32 @callee_add_two_overflow(
+    i32 inreg 0,  i32 inreg 1,  i32 inreg 2,  i32 inreg 3,
+    i32 inreg 4,  i32 inreg 5,  i32 inreg 6,  i32 inreg 7,
+    i32 inreg 8,  i32 inreg 9,  i32 inreg 10, i32 inreg 11,
+    i32 inreg 12, i32 inreg 13, i32 inreg 14, i32 inreg 15,
+    i32 inreg 16, i32 inreg 17, i32 inreg 18, i32 inreg 19,
+    i32 inreg 20, i32 inreg 21, i32 inreg 22, i32 inreg 23,
+    i32 inreg 24, i32 inreg 25, i32 inreg 26, i32 inreg 27,
+    i32 inreg 28, i32 inreg 29, i32 inreg 42, i32 inreg 58)
+  ret i32 %r
+}
+
+; --- Test 3: mixed inreg and non-inreg ---
+; Callee: s0 (SGPR) + s16 (overflow inreg from VGPR lane) -> s_add,
+;         then + v0 (normal VGPR arg) -> v_add.
+; Caller writes 10 to SGPR for s0, 20 to VGPR lane for s16, and passes
+; a VGPR value for v0.
+define i32 @callee_mixed(
+; CHECK-LABEL: callee_mixed:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 2
+; CHECK-NEXT:    s_add_i32 s16, s16, s4
+; CHECK-NEXT:    v_add_u32_e32 v0, s16, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+    i32 inreg %s0,  i32 inreg %s1,  i32 inreg %s2,  i32 inreg %s3,
+    i32 inreg %s4,  i32 inreg %s5,  i32 inreg %s6,  i32 inreg %s7,
+    i32 inreg %s8,  i32 inreg %s9,  i32 inreg %s10, i32 inreg %s11,
+    i32 inreg %s12, i32 inreg %s13, i32 inreg %s14, i32 inreg %s15,
+    i32 %v0,
+    i32 inreg %s16, i32 inreg %s17) {
+  %s_sum = add i32 %s0, %s16
+  %result = add i32 %s_sum, %v0
+  ret i32 %result
+}
+
+define i32 @caller_mixed(i32 %vgpr_val) {
+; CHECK-LABEL: caller_mixed:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s42, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, callee_mixed@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, callee_mixed@gotpcrel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v2, 14, 0
+; CHECK-NEXT:    s_load_dwordx2 s[40:41], s[16:17], 0x0
+; CHECK-NEXT:    v_writelane_b32 v2, 15, 1
+; CHECK-NEXT:    v_writelane_b32 v2, 20, 2
+; CHECK-NEXT:    v_writelane_b32 v2, 21, 3
+; CHECK-NEXT:    v_writelane_b32 v3, s30, 0
+; CHECK-NEXT:    s_mov_b32 s16, 10
+; CHECK-NEXT:    s_mov_b32 s17, 1
+; CHECK-NEXT:    s_mov_b32 s18, 2
+; CHECK-NEXT:    s_mov_b32 s19, 3
+; CHECK-NEXT:    s_mov_b32 s20, 4
+; CHECK-NEXT:    s_mov_b32 s21, 5
+; CHECK-NEXT:    s_mov_b32 s22, 6
+; CHECK-NEXT:    s_mov_b32 s23, 7
+; CHECK-NEXT:    s_mov_b32 s24, 8
+; CHECK-NEXT:    s_mov_b32 s25, 9
+; CHECK-NEXT:    s_mov_b32 s26, 10
+; CHECK-NEXT:    s_mov_b32 s27, 11
+; CHECK-NEXT:    s_mov_b32 s28, 12
+; CHECK-NEXT:    s_mov_b32 s29, 13
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    v_writelane_b32 v3, s31, 1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT:    v_readlane_b32 s31, v3, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v3, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    s_mov_b32 s33, s42
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %r = call i32 @callee_mixed(
+    i32 inreg 10, i32 inreg 1,  i32 inreg 2,  i32 inreg 3,
+    i32 inreg 4,  i32 inreg 5,  i32 inreg 6,  i32 inreg 7,
+    i32 inreg 8,  i32 inreg 9,  i32 inreg 10, i32 inreg 11,
+    i32 inreg 12, i32 inreg 13, i32 inreg 14, i32 inreg 15,
+    i32 %vgpr_val,
+    i32 inreg 20, i32 inreg 21)
+  ret i32 %r
+}



More information about the llvm-commits mailing list