[llvm-branch-commits] [llvm] [AMDGPU] Fix caller/callee mismatch in SGPR assignment for inreg args (PR #182754)

Shilei Tian via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Sun Feb 22 08:51:30 PST 2026


https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/182754

On the callee side, `LowerFormalArguments` marks SGPR0-3 as allocated in
`CCState` before running the CC analysis. On the caller side, `LowerCall` (and
GlobalISel's `lowerCall`/`lowerTailCall`) added the scratch resource to
`RegsToPass` without marking it in `CCState`. This caused `CC_AMDGPU_Func` to
treat SGPR0-3 as available on the caller side, assigning user inreg args there,
while the callee skipped them without marking it in `CCState`. This caused
`CC_AMDGPU_Func` to treat SGPR0-3 as available on the caller side, assigning
user inreg args there, while the callee skipped them.

>From 6a01b19ff3be0aea9707292bcd29951e57395ada Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Sun, 22 Feb 2026 11:50:49 -0500
Subject: [PATCH] [AMDGPU] Fix caller/callee mismatch in SGPR assignment for
 inreg args

On the callee side, `LowerFormalArguments` marks SGPR0-3 as allocated in
`CCState` before running the CC analysis. On the caller side, `LowerCall` (and
GlobalISel's `lowerCall`/`lowerTailCall`) added the scratch resource to
`RegsToPass` without marking it in `CCState`. This caused `CC_AMDGPU_Func` to
treat SGPR0-3 as available on the caller side, assigning user inreg args there,
while the callee skipped them without marking it in `CCState`. This caused
`CC_AMDGPU_Func` to treat SGPR0-3 as available on the caller side, assigning
user inreg args there, while the callee skipped them.
---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  12 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   5 +
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |  82 ++--
 .../CodeGen/AMDGPU/call-args-inreg-bfloat.ll  |   2 -
 ...l-args-inreg-no-sgpr-for-csrspill-xfail.ll |  86 +++-
 llvm/test/CodeGen/AMDGPU/call-args-inreg.ll   | 412 +++++++++++++++++-
 .../AMDGPU/cc-inreg-sgpr0-3-mismatch.ll       | 207 +++++++--
 .../CodeGen/AMDGPU/function-args-inreg.ll     |   3 -
 ...-call-inreg-arguments.convergencetokens.ll |   6 +-
 .../AMDGPU/tail-call-inreg-arguments.error.ll |  16 +-
 .../AMDGPU/tail-call-inreg-arguments.ll       |  12 +-
 11 files changed, 730 insertions(+), 113 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5c6affdae0c5b..8e464c6168429 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1399,6 +1399,11 @@ bool AMDGPUCallLowering::lowerTailCall(
       return false;
   }
 
+  // Mark the scratch resource descriptor as allocated so the CC analysis
+  // does not assign user arguments to these registers, matching the callee.
+  if (!ST.hasFlatScratchEnabled())
+    CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+
   OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
 
   if (!determineAssignments(Assigner, OutArgs, CCInfo))
@@ -1598,6 +1603,13 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
       return false;
   }
 
+  // Mark the scratch resource descriptor as allocated so the CC analysis
+  // does not assign user arguments to these registers, matching the callee.
+  if (!ST.hasFlatScratchEnabled()) {
+    const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+    CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+  }
+
   // Do the actual argument marshalling.
   OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
   if (!determineAssignments(Assigner, OutArgs, CCInfo))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8533b1bd06d90..bf88005ce8737 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4329,6 +4329,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }
 
+  // Mark the scratch resource descriptor as allocated so the CC analysis
+  // does not assign user arguments to these registers, matching the callee.
+  if (!Subtarget->hasFlatScratchEnabled())
+    CCInfo.AllocateReg(Info->getScratchRSrcReg());
+
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index c935310584949..2c1fd5800899a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -5378,7 +5378,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5390,7 +5390,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_i16_inreg(i16 inreg %arg)
@@ -5424,7 +5424,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5436,7 +5436,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_i32_inreg(i32 inreg %arg)
@@ -5473,9 +5473,9 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5487,7 +5487,7 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_i64_inreg(i64 inreg %arg)
@@ -5524,9 +5524,9 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5538,7 +5538,7 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2i32_inreg(<2 x i32> inreg %arg)
@@ -5574,7 +5574,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5586,7 +5586,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_f16_inreg(half inreg %arg)
@@ -5634,7 +5634,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5646,7 +5646,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_f32_inreg(float inreg %arg)
@@ -5683,9 +5683,9 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5697,7 +5697,7 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_f64_inreg(double inreg %arg)
@@ -5732,7 +5732,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5744,7 +5744,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2f16_inreg(<2 x half> inreg %arg)
@@ -5787,10 +5787,10 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
   ; CHECK-NEXT:   [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s16>)
   ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5802,7 +5802,7 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v3f16_inreg(<3 x half> inreg %arg)
@@ -5840,10 +5840,10 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>)
   ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5855,7 +5855,7 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v4f16_inreg(<4 x half> inreg %arg)
@@ -5892,9 +5892,9 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p0)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5906,7 +5906,7 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_p0_inreg(ptr inreg %arg)
@@ -5943,9 +5943,9 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5957,7 +5957,7 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p1_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p1_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
@@ -5992,7 +5992,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY9]](p3)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[PTRTOINT]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -6004,7 +6004,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p3_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p3_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
@@ -6045,13 +6045,13 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
   ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p1>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
-  ; CHECK-NEXT:   $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
+  ; CHECK-NEXT:   $sgpr18 = COPY [[INTRINSIC_CONVERGENT2]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
-  ; CHECK-NEXT:   $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
+  ; CHECK-NEXT:   $sgpr19 = COPY [[INTRINSIC_CONVERGENT3]](s32)
   ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY13]](p4)
@@ -6063,7 +6063,7 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY19]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY20]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY21]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p1_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p1_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg)
@@ -6100,9 +6100,9 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
   ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p5>)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
-  ; CHECK-NEXT:   $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+  ; CHECK-NEXT:   $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -6114,7 +6114,7 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY17]](s32)
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY18]](s32)
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY19]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p5_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p5_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call void @external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
index 04e472419ca61..0511db2f02ff2 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
@@ -23,7 +23,6 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_bf16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_bf16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -83,7 +82,6 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2bf16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2bf16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
index 34f4476f7fd6a..bd03e092c0fa0 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
@@ -1,22 +1,104 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 < %s | FileCheck -enable-var-scope %s
 
-; CHECK: illegal VGPR to SGPR copy
+; This test previously hit "illegal VGPR to SGPR copy" because the caller CC
+; analysis assigned user inreg args to SGPR0-3, overlapping the scratch resource
+; descriptor. With the fix that marks SGPR0-3 as allocated before CC analysis,
+; the overlap is eliminated and these functions compile correctly.
 
 declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
 declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0
 declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0
 
 define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 {
+; CHECK-LABEL: test_call_external_void_func_a15i32_inreg:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s40, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[42:43]
+; CHECK-NEXT:    v_writelane_b32 v40, s40, 2
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-NEXT:    s_getpc_b64 s[40:41]
+; CHECK-NEXT:    s_add_u32 s40, s40, external_void_func_a15i32_inreg at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s41, s41, external_void_func_a15i32_inreg at rel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0)
   ret void
 }
 
 define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 {
+; CHECK-LABEL: test_call_external_void_func_a16i32_inreg:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s40, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[42:43]
+; CHECK-NEXT:    v_writelane_b32 v40, s40, 2
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-NEXT:    s_getpc_b64 s[40:41]
+; CHECK-NEXT:    s_add_u32 s40, s40, external_void_func_a16i32_inreg at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s41, s41, external_void_func_a16i32_inreg at rel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0)
   ret void
 }
 
 define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s40, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[42:43]
+; CHECK-NEXT:    v_writelane_b32 v40, s40, 2
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-NEXT:    s_getpc_b64 s[40:41]
+; CHECK-NEXT:    s_add_u32 s40, s40, external_void_func_a15i32_inreg_i32_inreg at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s41, s41, external_void_func_a15i32_inreg_i32_inreg at rel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    v_readlane_b32 s4, v40, 2
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index f96007ae513bd..c1b3278144d59 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -46,7 +46,6 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i8_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i8_inreg at rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -106,7 +105,6 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -166,7 +164,6 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -212,6 +209,33 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_i64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_i64_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -244,6 +268,33 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v2i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -276,6 +327,33 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s19, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s19, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v3i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v3i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v3i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -308,6 +386,33 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s20, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-NEXT:    v_writelane_b32 v40, s20, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v4i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v4i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v4i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -340,6 +445,33 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v8i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s24, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[26:27], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[26:27]
+; GFX9-NEXT:    v_writelane_b32 v40, s24, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[24:25]
+; GFX9-NEXT:    s_add_u32 s24, s24, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s25, s25, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[24:25]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v8i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -386,7 +518,6 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -446,7 +577,6 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f32_inreg at rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -492,6 +622,33 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_f64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_f64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_f64_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -538,7 +695,6 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2f16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2f16_inreg at rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -584,6 +740,33 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
 }
 
 define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v3f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v3f16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v3f16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -616,6 +799,33 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
 }
 
 define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v4f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v4f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v4f16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v4f16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -648,6 +858,33 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
 }
 
 define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_p0_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p0_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p0_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_p0_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -680,6 +917,33 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 }
 
 define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_p1_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p1_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p1_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_p1_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -726,7 +990,6 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX9-NEXT:    s_getpc_b64 s[18:19]
 ; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_p3_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_p3_inreg at rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
@@ -772,6 +1035,33 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 }
 
 define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2p1_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s20, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-NEXT:    v_writelane_b32 v40, s20, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v2p1_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v2p1_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v2p1_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -804,6 +1094,33 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 }
 
 define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2p5_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s18, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-NEXT:    v_writelane_b32 v40, s18, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[18:19]
+; GFX9-NEXT:    s_add_u32 s18, s18, external_void_func_v2p5_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s19, s19, external_void_func_v2p5_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_v2p5_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -836,6 +1153,33 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 }
 
 define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg %arg1, i64 inreg %arg2) #0 {
+; GFX9-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s21, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-NEXT:    v_writelane_b32 v40, s21, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[22:23]
+; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -868,6 +1212,33 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 }
 
 define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #0 {
+; GFX9-LABEL: test_call_external_void_func_a15i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s29, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[40:41]
+; GFX9-NEXT:    v_writelane_b32 v40, s29, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[40:41]
+; GFX9-NEXT:    s_add_u32 s40, s40, external_void_func_a15i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s41, s41, external_void_func_a15i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -902,6 +1273,33 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 
 ; FIXME: This should also fail
 define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inreg %arg0, i32 inreg %arg1) #1 {
+; GFX9-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s21, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-NEXT:    v_writelane_b32 v40, s21, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[22:23]
+; GFX9-NEXT:    s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[22:23]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
index d0d2d65f7edf8..86486e56d46ac 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GISEL %s
 
 ; Test for a caller/callee mismatch in SGPR assignment for inreg args.
 ;
@@ -22,6 +23,17 @@ define i32 @callee_returns_arg0(
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: callee_returns_arg0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: callee_returns_arg0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v0, s16
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
     i32 inreg %a0,  i32 inreg %a1,  i32 inreg %a2,  i32 inreg %a3,
     i32 inreg %a4,  i32 inreg %a5,  i32 inreg %a6,  i32 inreg %a7,
     i32 inreg %a8,  i32 inreg %a9,  i32 inreg %a10, i32 inreg %a11,
@@ -40,58 +52,175 @@ define i32 @caller_passes_42() {
 ; CHECK-NEXT:    s_mov_b32 s42, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
-; CHECK-NEXT:    buffer_store_dword v14, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, callee_returns_arg0 at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, callee_returns_arg0 at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[40:41], s[16:17], 0x0
-; CHECK-NEXT:    v_writelane_b32 v14, s30, 0
-; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_mov_b32 s1, 1
-; CHECK-NEXT:    s_mov_b32 s2, 2
-; CHECK-NEXT:    s_mov_b32 s3, 3
-; CHECK-NEXT:    s_mov_b32 s16, 4
-; CHECK-NEXT:    s_mov_b32 s17, 5
-; CHECK-NEXT:    s_mov_b32 s18, 6
-; CHECK-NEXT:    s_mov_b32 s19, 7
-; CHECK-NEXT:    s_mov_b32 s20, 8
-; CHECK-NEXT:    s_mov_b32 s21, 9
-; CHECK-NEXT:    s_mov_b32 s22, 10
-; CHECK-NEXT:    s_mov_b32 s23, 11
-; CHECK-NEXT:    s_mov_b32 s24, 12
-; CHECK-NEXT:    s_mov_b32 s25, 13
-; CHECK-NEXT:    s_mov_b32 s26, 14
-; CHECK-NEXT:    s_mov_b32 s27, 15
-; CHECK-NEXT:    s_mov_b32 s28, 16
-; CHECK-NEXT:    s_mov_b32 s29, 17
-; CHECK-NEXT:    v_mov_b32_e32 v0, 18
-; CHECK-NEXT:    v_mov_b32_e32 v1, 19
-; CHECK-NEXT:    v_mov_b32_e32 v2, 20
-; CHECK-NEXT:    v_mov_b32_e32 v3, 21
-; CHECK-NEXT:    v_mov_b32_e32 v4, 22
-; CHECK-NEXT:    v_mov_b32_e32 v5, 23
-; CHECK-NEXT:    v_mov_b32_e32 v6, 24
-; CHECK-NEXT:    v_mov_b32_e32 v7, 25
-; CHECK-NEXT:    v_mov_b32_e32 v8, 26
-; CHECK-NEXT:    v_mov_b32_e32 v9, 27
-; CHECK-NEXT:    v_mov_b32_e32 v10, 28
-; CHECK-NEXT:    v_mov_b32_e32 v11, 29
-; CHECK-NEXT:    v_mov_b32_e32 v12, 30
-; CHECK-NEXT:    v_mov_b32_e32 v13, 31
-; CHECK-NEXT:    v_writelane_b32 v14, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v18, s30, 0
+; CHECK-NEXT:    s_mov_b32 s16, 42
+; CHECK-NEXT:    s_mov_b32 s17, 1
+; CHECK-NEXT:    s_mov_b32 s18, 2
+; CHECK-NEXT:    s_mov_b32 s19, 3
+; CHECK-NEXT:    s_mov_b32 s20, 4
+; CHECK-NEXT:    s_mov_b32 s21, 5
+; CHECK-NEXT:    s_mov_b32 s22, 6
+; CHECK-NEXT:    s_mov_b32 s23, 7
+; CHECK-NEXT:    s_mov_b32 s24, 8
+; CHECK-NEXT:    s_mov_b32 s25, 9
+; CHECK-NEXT:    s_mov_b32 s26, 10
+; CHECK-NEXT:    s_mov_b32 s27, 11
+; CHECK-NEXT:    s_mov_b32 s28, 12
+; CHECK-NEXT:    s_mov_b32 s29, 13
+; CHECK-NEXT:    v_mov_b32_e32 v0, 14
+; CHECK-NEXT:    v_mov_b32_e32 v1, 15
+; CHECK-NEXT:    v_mov_b32_e32 v2, 16
+; CHECK-NEXT:    v_mov_b32_e32 v3, 17
+; CHECK-NEXT:    v_mov_b32_e32 v4, 18
+; CHECK-NEXT:    v_mov_b32_e32 v5, 19
+; CHECK-NEXT:    v_mov_b32_e32 v6, 20
+; CHECK-NEXT:    v_mov_b32_e32 v7, 21
+; CHECK-NEXT:    v_mov_b32_e32 v8, 22
+; CHECK-NEXT:    v_mov_b32_e32 v9, 23
+; CHECK-NEXT:    v_mov_b32_e32 v10, 24
+; CHECK-NEXT:    v_mov_b32_e32 v11, 25
+; CHECK-NEXT:    v_mov_b32_e32 v12, 26
+; CHECK-NEXT:    v_mov_b32_e32 v13, 27
+; CHECK-NEXT:    v_mov_b32_e32 v14, 28
+; CHECK-NEXT:    v_mov_b32_e32 v15, 29
+; CHECK-NEXT:    v_mov_b32_e32 v16, 30
+; CHECK-NEXT:    v_mov_b32_e32 v17, 31
+; CHECK-NEXT:    v_writelane_b32 v18, s31, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT:    v_readlane_b32 s31, v14, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v14, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v18, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v18, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_load_dword v14, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_mov_b32 s33, s42
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: caller_passes_42:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s42, s33
+; SDAG-NEXT:    s_mov_b32 s33, s32
+; SDAG-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; SDAG-NEXT:    buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
+; SDAG-NEXT:    s_mov_b64 exec, s[16:17]
+; SDAG-NEXT:    s_addk_i32 s32, 0x400
+; SDAG-NEXT:    s_getpc_b64 s[16:17]
+; SDAG-NEXT:    s_add_u32 s16, s16, callee_returns_arg0 at gotpcrel32@lo+4
+; SDAG-NEXT:    s_addc_u32 s17, s17, callee_returns_arg0 at gotpcrel32@hi+12
+; SDAG-NEXT:    s_load_dwordx2 s[40:41], s[16:17], 0x0
+; SDAG-NEXT:    v_writelane_b32 v18, s30, 0
+; SDAG-NEXT:    s_mov_b32 s16, 42
+; SDAG-NEXT:    s_mov_b32 s17, 1
+; SDAG-NEXT:    s_mov_b32 s18, 2
+; SDAG-NEXT:    s_mov_b32 s19, 3
+; SDAG-NEXT:    s_mov_b32 s20, 4
+; SDAG-NEXT:    s_mov_b32 s21, 5
+; SDAG-NEXT:    s_mov_b32 s22, 6
+; SDAG-NEXT:    s_mov_b32 s23, 7
+; SDAG-NEXT:    s_mov_b32 s24, 8
+; SDAG-NEXT:    s_mov_b32 s25, 9
+; SDAG-NEXT:    s_mov_b32 s26, 10
+; SDAG-NEXT:    s_mov_b32 s27, 11
+; SDAG-NEXT:    s_mov_b32 s28, 12
+; SDAG-NEXT:    s_mov_b32 s29, 13
+; SDAG-NEXT:    v_mov_b32_e32 v0, 14
+; SDAG-NEXT:    v_mov_b32_e32 v1, 15
+; SDAG-NEXT:    v_mov_b32_e32 v2, 16
+; SDAG-NEXT:    v_mov_b32_e32 v3, 17
+; SDAG-NEXT:    v_mov_b32_e32 v4, 18
+; SDAG-NEXT:    v_mov_b32_e32 v5, 19
+; SDAG-NEXT:    v_mov_b32_e32 v6, 20
+; SDAG-NEXT:    v_mov_b32_e32 v7, 21
+; SDAG-NEXT:    v_mov_b32_e32 v8, 22
+; SDAG-NEXT:    v_mov_b32_e32 v9, 23
+; SDAG-NEXT:    v_mov_b32_e32 v10, 24
+; SDAG-NEXT:    v_mov_b32_e32 v11, 25
+; SDAG-NEXT:    v_mov_b32_e32 v12, 26
+; SDAG-NEXT:    v_mov_b32_e32 v13, 27
+; SDAG-NEXT:    v_mov_b32_e32 v14, 28
+; SDAG-NEXT:    v_mov_b32_e32 v15, 29
+; SDAG-NEXT:    v_mov_b32_e32 v16, 30
+; SDAG-NEXT:    v_mov_b32_e32 v17, 31
+; SDAG-NEXT:    v_writelane_b32 v18, s31, 1
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; SDAG-NEXT:    v_readlane_b32 s31, v18, 1
+; SDAG-NEXT:    v_readlane_b32 s30, v18, 0
+; SDAG-NEXT:    s_mov_b32 s32, s33
+; SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SDAG-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
+; SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; SDAG-NEXT:    s_mov_b32 s33, s42
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: caller_passes_42:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s42, s33
+; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; GISEL-NEXT:    buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
+; GISEL-NEXT:    s_mov_b64 exec, s[16:17]
+; GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GISEL-NEXT:    s_getpc_b64 s[16:17]
+; GISEL-NEXT:    s_add_u32 s16, s16, callee_returns_arg0 at gotpcrel32@lo+4
+; GISEL-NEXT:    s_addc_u32 s17, s17, callee_returns_arg0 at gotpcrel32@hi+12
+; GISEL-NEXT:    s_load_dwordx2 s[40:41], s[16:17], 0x0
+; GISEL-NEXT:    v_writelane_b32 v18, s30, 0
+; GISEL-NEXT:    s_mov_b32 s16, 42
+; GISEL-NEXT:    s_mov_b32 s17, 1
+; GISEL-NEXT:    s_mov_b32 s18, 2
+; GISEL-NEXT:    s_mov_b32 s19, 3
+; GISEL-NEXT:    s_mov_b32 s20, 4
+; GISEL-NEXT:    s_mov_b32 s21, 5
+; GISEL-NEXT:    s_mov_b32 s22, 6
+; GISEL-NEXT:    s_mov_b32 s23, 7
+; GISEL-NEXT:    s_mov_b32 s24, 8
+; GISEL-NEXT:    s_mov_b32 s25, 9
+; GISEL-NEXT:    s_mov_b32 s26, 10
+; GISEL-NEXT:    s_mov_b32 s27, 11
+; GISEL-NEXT:    s_mov_b32 s28, 12
+; GISEL-NEXT:    s_mov_b32 s29, 13
+; GISEL-NEXT:    v_mov_b32_e32 v0, 14
+; GISEL-NEXT:    v_mov_b32_e32 v1, 15
+; GISEL-NEXT:    v_mov_b32_e32 v2, 16
+; GISEL-NEXT:    v_mov_b32_e32 v3, 17
+; GISEL-NEXT:    v_mov_b32_e32 v4, 18
+; GISEL-NEXT:    v_mov_b32_e32 v5, 19
+; GISEL-NEXT:    v_mov_b32_e32 v6, 20
+; GISEL-NEXT:    v_mov_b32_e32 v7, 21
+; GISEL-NEXT:    v_mov_b32_e32 v8, 22
+; GISEL-NEXT:    v_mov_b32_e32 v9, 23
+; GISEL-NEXT:    v_mov_b32_e32 v10, 24
+; GISEL-NEXT:    v_mov_b32_e32 v11, 25
+; GISEL-NEXT:    v_mov_b32_e32 v12, 26
+; GISEL-NEXT:    v_mov_b32_e32 v13, 27
+; GISEL-NEXT:    v_mov_b32_e32 v14, 28
+; GISEL-NEXT:    v_mov_b32_e32 v15, 29
+; GISEL-NEXT:    v_mov_b32_e32 v16, 30
+; GISEL-NEXT:    v_mov_b32_e32 v17, 31
+; GISEL-NEXT:    v_writelane_b32 v18, s31, 1
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; GISEL-NEXT:    v_readlane_b32 s31, v18, 1
+; GISEL-NEXT:    v_readlane_b32 s30, v18, 0
+; GISEL-NEXT:    s_mov_b32 s32, s33
+; GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GISEL-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
+; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL-NEXT:    s_mov_b32 s33, s42
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @callee_returns_arg0(
     i32 inreg 42, i32 inreg 1,  i32 inreg 2,  i32 inreg 3,
     i32 inreg 4,  i32 inreg 5,  i32 inreg 6,  i32 inreg 7,
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 831d10480c51c..0bf987ab6f19c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1734,9 +1734,6 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
 ; GFX9-NEXT:    s_load_dwordx2 s[20:21], s[20:21], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    s_mov_b32 s2, s18
-; GFX9-NEXT:    s_mov_b32 s1, s17
-; GFX9-NEXT:    s_mov_b32 s0, s16
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll
index 41e7ad7b8e2fa..86275a8444c45 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll
@@ -43,10 +43,10 @@ define void @tail_call_i64_inreg_uniform_in_vgpr_convergence_tokens() #0 {
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY2]]
   ; CHECK-NEXT:   $sgpr15 = COPY [[COPY1]]
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY]]
-  ; CHECK-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
-  ; CHECK-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; CHECK-NEXT:   $sgpr16 = COPY [[V_READFIRSTLANE_B32_]]
+  ; CHECK-NEXT:   $sgpr17 = COPY [[V_READFIRSTLANE_B32_1]]
   ; CHECK-NEXT:   CONVERGENCECTRL_GLUE [[CONVERGENCECTRL_ENTRY]]
-  ; CHECK-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @void_func_i64_inreg, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0, implicit $sgpr1, implicit [[CONVERGENCECTRL_ENTRY]]
+  ; CHECK-NEXT:   SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @void_func_i64_inreg, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr16, implicit $sgpr17, implicit [[CONVERGENCECTRL_ENTRY]]
   %t = call token @llvm.experimental.convergence.entry()
   %uniform.vgpr = load i64, ptr addrspace(3) zeroinitializer, align 8
   tail call void @void_func_i64_inreg(i64 inreg %uniform.vgpr) #0 [ "convergencectrl"(token %t) ]
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
index 242b5e9aeaf42..cb34262e2d4ee 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
@@ -21,11 +21,11 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) {
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, void_func_i32_inreg at rel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, void_func_i32_inreg at rel32@hi+12
-; CHECK-NEXT:     ; illegal copy v0 to s0
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_getpc_b64 s[18:19]
+; CHECK-NEXT:    s_add_u32 s18, s18, void_func_i32_inreg at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s19, s19, void_func_i32_inreg at rel32@hi+12
+; CHECK-NEXT:     ; illegal copy v0 to s16
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
@@ -56,12 +56,12 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) {
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, constant at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, constant at rel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:     ; illegal copy v0 to s0
+; CHECK-NEXT:     ; illegal copy v0 to s16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
index 5506b29b99895..2449812e61661 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
@@ -10,7 +10,6 @@ define void @tail_call_i32_inreg_uniform(i32 inreg %sgpr) {
 ; CHECK-NEXT:    s_getpc_b64 s[18:19]
 ; CHECK-NEXT:    s_add_u32 s18, s18, void_func_i32_inreg at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s19, s19, void_func_i32_inreg at rel32@hi+12
-; CHECK-NEXT:    s_mov_b32 s0, s16
 ; CHECK-NEXT:    s_setpc_b64 s[18:19]
   tail call void @void_func_i32_inreg(i32 inreg %sgpr)
   ret void
@@ -26,7 +25,6 @@ define void @indirect_tail_call_i32_inreg_uniform(i32 inreg %sgpr) {
 ; CHECK-NEXT:    s_add_u32 s18, s18, constant at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s19, s19, constant at rel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[18:19], 0x0
-; CHECK-NEXT:    s_mov_b32 s0, s16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[18:19]
   %fptr = load ptr, ptr addrspace(4) @constant, align 8
@@ -44,8 +42,6 @@ define void @tail_call_i64_inreg_uniform(i64 inreg %sgpr) {
 ; CHECK-NEXT:    s_add_u32 s18, s18, void_func_i64_inreg at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s19, s19, void_func_i64_inreg at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[18:19], 0x0
-; CHECK-NEXT:    s_mov_b32 s1, s17
-; CHECK-NEXT:    s_mov_b32 s0, s16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[18:19]
   tail call void @void_func_i64_inreg(i64 inreg %sgpr)
@@ -61,11 +57,11 @@ define void @tail_call_i64_inreg_uniform_in_vgpr() {
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, void_func_i64_inreg at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, void_func_i64_inreg at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
-; CHECK-NEXT:    s_setpc_b64 s[16:17]
+; CHECK-NEXT:    v_readfirstlane_b32 s16, v0
+; CHECK-NEXT:    v_readfirstlane_b32 s17, v1
+; CHECK-NEXT:    s_setpc_b64 s[18:19]
   %uniform.vgpr = load i64, ptr addrspace(3) zeroinitializer, align 8
   tail call void @void_func_i64_inreg(i64 inreg %uniform.vgpr)
   ret void



More information about the llvm-branch-commits mailing list