[llvm-branch-commits] [llvm] [AMDGPU] Fix caller/callee mismatch in SGPR assignment for inreg args (PR #182754)
Shilei Tian via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Feb 22 08:51:30 PST 2026
https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/182754
On the callee side, `LowerFormalArguments` marks SGPR0-3 as allocated in
`CCState` before running the CC analysis. On the caller side, `LowerCall` (and
GlobalISel's `lowerCall`/`lowerTailCall`) added the scratch resource to
`RegsToPass` without marking it in `CCState`. This caused `CC_AMDGPU_Func` to
treat SGPR0-3 as available on the caller side, assigning user inreg args there,
while the callee skipped them.
>From 6a01b19ff3be0aea9707292bcd29951e57395ada Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Sun, 22 Feb 2026 11:50:49 -0500
Subject: [PATCH] [AMDGPU] Fix caller/callee mismatch in SGPR assignment for
inreg args
On the callee side, `LowerFormalArguments` marks SGPR0-3 as allocated in
`CCState` before running the CC analysis. On the caller side, `LowerCall` (and
GlobalISel's `lowerCall`/`lowerTailCall`) added the scratch resource to
`RegsToPass` without marking it in `CCState`. This caused `CC_AMDGPU_Func` to
treat SGPR0-3 as available on the caller side, assigning user inreg args there,
while the callee skipped them.
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 12 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 +
.../AMDGPU/GlobalISel/irtranslator-call.ll | 82 ++--
.../CodeGen/AMDGPU/call-args-inreg-bfloat.ll | 2 -
...l-args-inreg-no-sgpr-for-csrspill-xfail.ll | 86 +++-
llvm/test/CodeGen/AMDGPU/call-args-inreg.ll | 412 +++++++++++++++++-
.../AMDGPU/cc-inreg-sgpr0-3-mismatch.ll | 207 +++++++--
.../CodeGen/AMDGPU/function-args-inreg.ll | 3 -
...-call-inreg-arguments.convergencetokens.ll | 6 +-
.../AMDGPU/tail-call-inreg-arguments.error.ll | 16 +-
.../AMDGPU/tail-call-inreg-arguments.ll | 12 +-
11 files changed, 730 insertions(+), 113 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5c6affdae0c5b..8e464c6168429 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1399,6 +1399,11 @@ bool AMDGPUCallLowering::lowerTailCall(
return false;
}
+ // Mark the scratch resource descriptor as allocated so the CC analysis
+ // does not assign user arguments to these registers, matching the callee.
+ if (!ST.hasFlatScratchEnabled())
+ CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+
OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
if (!determineAssignments(Assigner, OutArgs, CCInfo))
@@ -1598,6 +1603,13 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
}
+ // Mark the scratch resource descriptor as allocated so the CC analysis
+ // does not assign user arguments to these registers, matching the callee.
+ if (!ST.hasFlatScratchEnabled()) {
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
+ }
+
// Do the actual argument marshalling.
OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
if (!determineAssignments(Assigner, OutArgs, CCInfo))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8533b1bd06d90..bf88005ce8737 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4329,6 +4329,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
+ // Mark the scratch resource descriptor as allocated so the CC analysis
+ // does not assign user arguments to these registers, matching the callee.
+ if (!Subtarget->hasFlatScratchEnabled())
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index c935310584949..2c1fd5800899a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -5378,7 +5378,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5390,7 +5390,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_i16_inreg(i16 inreg %arg)
@@ -5424,7 +5424,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5436,7 +5436,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_i32_inreg(i32 inreg %arg)
@@ -5473,9 +5473,9 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5487,7 +5487,7 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_i64_inreg(i64 inreg %arg)
@@ -5524,9 +5524,9 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5538,7 +5538,7 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v2i32_inreg(<2 x i32> inreg %arg)
@@ -5574,7 +5574,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5586,7 +5586,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_f16_inreg(half inreg %arg)
@@ -5634,7 +5634,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5646,7 +5646,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_f32_inreg(float inreg %arg)
@@ -5683,9 +5683,9 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5697,7 +5697,7 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_f64_inreg(double inreg %arg)
@@ -5732,7 +5732,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -5744,7 +5744,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v2f16_inreg(<2 x half> inreg %arg)
@@ -5787,10 +5787,10 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; CHECK-NEXT: [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s16>)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST1]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5802,7 +5802,7 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v3f16_inreg(<3 x half> inreg %arg)
@@ -5840,10 +5840,10 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST1]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5855,7 +5855,7 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4f16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4f16_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v4f16_inreg(<4 x half> inreg %arg)
@@ -5892,9 +5892,9 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p0)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5906,7 +5906,7 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_p0_inreg(ptr inreg %arg)
@@ -5943,9 +5943,9 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -5957,7 +5957,7 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p1_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p1_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
@@ -5992,7 +5992,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY9]](p3)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[PTRTOINT]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -6004,7 +6004,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p3_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p3_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
@@ -6045,13 +6045,13 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p1>)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
- ; CHECK-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
+ ; CHECK-NEXT: $sgpr18 = COPY [[INTRINSIC_CONVERGENT2]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
- ; CHECK-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
+ ; CHECK-NEXT: $sgpr19 = COPY [[INTRINSIC_CONVERGENT3]](s32)
; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY13]](p4)
@@ -6063,7 +6063,7 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
; CHECK-NEXT: $sgpr14 = COPY [[COPY19]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY20]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY21]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p1_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p1_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg)
@@ -6100,9 +6100,9 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p5>)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: $sgpr16 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
- ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
+ ; CHECK-NEXT: $sgpr17 = COPY [[INTRINSIC_CONVERGENT1]](s32)
; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
@@ -6114,7 +6114,7 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
; CHECK-NEXT: $sgpr14 = COPY [[COPY17]](s32)
; CHECK-NEXT: $sgpr15 = COPY [[COPY18]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY19]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p5_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p5_inreg, csr_amdgpu, implicit $sgpr16, implicit $sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call void @external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
index 04e472419ca61..0511db2f02ff2 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
@@ -23,7 +23,6 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -83,7 +82,6 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
index 34f4476f7fd6a..bd03e092c0fa0 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll
@@ -1,22 +1,104 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 < %s | FileCheck -enable-var-scope %s
-; CHECK: illegal VGPR to SGPR copy
+; This test previously hit "illegal VGPR to SGPR copy" because the caller CC
+; analysis assigned user inreg args to SGPR0-3, overlapping the scratch resource
+; descriptor. With the fix that marks SGPR0-3 as allocated before CC analysis,
+; the overlap is eliminated and these functions compile correctly.
declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0
declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0
define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 {
+; CHECK-LABEL: test_call_external_void_func_a15i32_inreg:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s40, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[42:43]
+; CHECK-NEXT: v_writelane_b32 v40, s40, 2
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: s_getpc_b64 s[40:41]
+; CHECK-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s4, v40, 2
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0)
ret void
}
define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 {
+; CHECK-LABEL: test_call_external_void_func_a16i32_inreg:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s40, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[42:43]
+; CHECK-NEXT: v_writelane_b32 v40, s40, 2
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: s_getpc_b64 s[40:41]
+; CHECK-NEXT: s_add_u32 s40, s40, external_void_func_a16i32_inreg@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s41, s41, external_void_func_a16i32_inreg@rel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s4, v40, 2
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0)
ret void
}
define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s40, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[42:43]
+; CHECK-NEXT: v_writelane_b32 v40, s40, 2
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: s_getpc_b64 s[40:41]
+; CHECK-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg_i32_inreg@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg_i32_inreg@rel32@hi+12
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[40:41]
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s4, v40, 2
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index f96007ae513bd..c1b3278144d59 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -46,7 +46,6 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i8_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -106,7 +105,6 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -166,7 +164,6 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i32_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -212,6 +209,33 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
}
define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_i64_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-NEXT: v_writelane_b32 v40, s18, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_i64_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -244,6 +268,33 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
}
define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i32_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-NEXT: v_writelane_b32 v40, s18, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_v2i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -276,6 +327,33 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
}
define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i32_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s19, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-NEXT: v_writelane_b32 v40, s19, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[20:21]
+; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_v3i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -308,6 +386,33 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
}
define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i32_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s20, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[22:23]
+; GFX9-NEXT: v_writelane_b32 v40, s20, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[20:21]
+; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_v4i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -340,6 +445,33 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
}
define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v8i32_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s24, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[26:27], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[26:27]
+; GFX9-NEXT: v_writelane_b32 v40, s24, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[24:25]
+; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_v8i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -386,7 +518,6 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -446,7 +577,6 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f32_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -492,6 +622,33 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
}
define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_f64_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-NEXT: v_writelane_b32 v40, s18, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_f64_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -538,7 +695,6 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2f16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -584,6 +740,33 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
}
define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f16_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-NEXT: v_writelane_b32 v40, s18, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_v3f16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -616,6 +799,33 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
}
define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v4f16_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-NEXT: v_writelane_b32 v40, s18, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_v4f16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -648,6 +858,33 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
}
define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_p0_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-NEXT: v_writelane_b32 v40, s18, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_p0_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -680,6 +917,33 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
}
define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_p1_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-NEXT: v_writelane_b32 v40, s18, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_p1_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -726,7 +990,6 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p3_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
@@ -772,6 +1035,33 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
}
define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2p1_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s20, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[22:23]
+; GFX9-NEXT: v_writelane_b32 v40, s20, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[20:21]
+; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_v2p1_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -804,6 +1094,33 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
}
define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2p5_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-NEXT: v_writelane_b32 v40, s18, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_v2p5_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -836,6 +1153,33 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
}
define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg %arg1, i64 inreg %arg2) #0 {
+; GFX9-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s21, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[22:23]
+; GFX9-NEXT: v_writelane_b32 v40, s21, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[22:23]
+; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -868,6 +1212,33 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
}
define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #0 {
+; GFX9-LABEL: test_call_external_void_func_a15i32_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s29, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[40:41]
+; GFX9-NEXT: v_writelane_b32 v40, s29, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[40:41]
+; GFX9-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[40:41]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_a15i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -902,6 +1273,33 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
; FIXME: This should also fail
define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inreg %arg0, i32 inreg %arg1) #1 {
+; GFX9-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s21, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[22:23]
+; GFX9-NEXT: v_writelane_b32 v40, s21, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[22:23]
+; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
index d0d2d65f7edf8..86486e56d46ac 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-inreg-sgpr0-3-mismatch.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GISEL %s
; Test for a caller/callee mismatch in SGPR assignment for inreg args.
;
@@ -22,6 +23,17 @@ define i32 @callee_returns_arg0(
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s16
; CHECK-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: callee_returns_arg0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: callee_returns_arg0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s16
+; GISEL-NEXT: s_setpc_b64 s[30:31]
i32 inreg %a0, i32 inreg %a1, i32 inreg %a2, i32 inreg %a3,
i32 inreg %a4, i32 inreg %a5, i32 inreg %a6, i32 inreg %a7,
i32 inreg %a8, i32 inreg %a9, i32 inreg %a10, i32 inreg %a11,
@@ -40,58 +52,175 @@ define i32 @caller_passes_42() {
; CHECK-NEXT: s_mov_b32 s42, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, callee_returns_arg0@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, callee_returns_arg0@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[40:41], s[16:17], 0x0
-; CHECK-NEXT: v_writelane_b32 v14, s30, 0
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_mov_b32 s1, 1
-; CHECK-NEXT: s_mov_b32 s2, 2
-; CHECK-NEXT: s_mov_b32 s3, 3
-; CHECK-NEXT: s_mov_b32 s16, 4
-; CHECK-NEXT: s_mov_b32 s17, 5
-; CHECK-NEXT: s_mov_b32 s18, 6
-; CHECK-NEXT: s_mov_b32 s19, 7
-; CHECK-NEXT: s_mov_b32 s20, 8
-; CHECK-NEXT: s_mov_b32 s21, 9
-; CHECK-NEXT: s_mov_b32 s22, 10
-; CHECK-NEXT: s_mov_b32 s23, 11
-; CHECK-NEXT: s_mov_b32 s24, 12
-; CHECK-NEXT: s_mov_b32 s25, 13
-; CHECK-NEXT: s_mov_b32 s26, 14
-; CHECK-NEXT: s_mov_b32 s27, 15
-; CHECK-NEXT: s_mov_b32 s28, 16
-; CHECK-NEXT: s_mov_b32 s29, 17
-; CHECK-NEXT: v_mov_b32_e32 v0, 18
-; CHECK-NEXT: v_mov_b32_e32 v1, 19
-; CHECK-NEXT: v_mov_b32_e32 v2, 20
-; CHECK-NEXT: v_mov_b32_e32 v3, 21
-; CHECK-NEXT: v_mov_b32_e32 v4, 22
-; CHECK-NEXT: v_mov_b32_e32 v5, 23
-; CHECK-NEXT: v_mov_b32_e32 v6, 24
-; CHECK-NEXT: v_mov_b32_e32 v7, 25
-; CHECK-NEXT: v_mov_b32_e32 v8, 26
-; CHECK-NEXT: v_mov_b32_e32 v9, 27
-; CHECK-NEXT: v_mov_b32_e32 v10, 28
-; CHECK-NEXT: v_mov_b32_e32 v11, 29
-; CHECK-NEXT: v_mov_b32_e32 v12, 30
-; CHECK-NEXT: v_mov_b32_e32 v13, 31
-; CHECK-NEXT: v_writelane_b32 v14, s31, 1
+; CHECK-NEXT: v_writelane_b32 v18, s30, 0
+; CHECK-NEXT: s_mov_b32 s16, 42
+; CHECK-NEXT: s_mov_b32 s17, 1
+; CHECK-NEXT: s_mov_b32 s18, 2
+; CHECK-NEXT: s_mov_b32 s19, 3
+; CHECK-NEXT: s_mov_b32 s20, 4
+; CHECK-NEXT: s_mov_b32 s21, 5
+; CHECK-NEXT: s_mov_b32 s22, 6
+; CHECK-NEXT: s_mov_b32 s23, 7
+; CHECK-NEXT: s_mov_b32 s24, 8
+; CHECK-NEXT: s_mov_b32 s25, 9
+; CHECK-NEXT: s_mov_b32 s26, 10
+; CHECK-NEXT: s_mov_b32 s27, 11
+; CHECK-NEXT: s_mov_b32 s28, 12
+; CHECK-NEXT: s_mov_b32 s29, 13
+; CHECK-NEXT: v_mov_b32_e32 v0, 14
+; CHECK-NEXT: v_mov_b32_e32 v1, 15
+; CHECK-NEXT: v_mov_b32_e32 v2, 16
+; CHECK-NEXT: v_mov_b32_e32 v3, 17
+; CHECK-NEXT: v_mov_b32_e32 v4, 18
+; CHECK-NEXT: v_mov_b32_e32 v5, 19
+; CHECK-NEXT: v_mov_b32_e32 v6, 20
+; CHECK-NEXT: v_mov_b32_e32 v7, 21
+; CHECK-NEXT: v_mov_b32_e32 v8, 22
+; CHECK-NEXT: v_mov_b32_e32 v9, 23
+; CHECK-NEXT: v_mov_b32_e32 v10, 24
+; CHECK-NEXT: v_mov_b32_e32 v11, 25
+; CHECK-NEXT: v_mov_b32_e32 v12, 26
+; CHECK-NEXT: v_mov_b32_e32 v13, 27
+; CHECK-NEXT: v_mov_b32_e32 v14, 28
+; CHECK-NEXT: v_mov_b32_e32 v15, 29
+; CHECK-NEXT: v_mov_b32_e32 v16, 30
+; CHECK-NEXT: v_mov_b32_e32 v17, 31
+; CHECK-NEXT: v_writelane_b32 v18, s31, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[40:41]
-; CHECK-NEXT: v_readlane_b32 s31, v14, 1
-; CHECK-NEXT: v_readlane_b32 s30, v14, 0
+; CHECK-NEXT: v_readlane_b32 s31, v18, 1
+; CHECK-NEXT: v_readlane_b32 s30, v18, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_mov_b32 s33, s42
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: caller_passes_42:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s42, s33
+; SDAG-NEXT: s_mov_b32 s33, s32
+; SDAG-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; SDAG-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
+; SDAG-NEXT: s_mov_b64 exec, s[16:17]
+; SDAG-NEXT: s_addk_i32 s32, 0x400
+; SDAG-NEXT: s_getpc_b64 s[16:17]
+; SDAG-NEXT: s_add_u32 s16, s16, callee_returns_arg0@gotpcrel32@lo+4
+; SDAG-NEXT: s_addc_u32 s17, s17, callee_returns_arg0@gotpcrel32@hi+12
+; SDAG-NEXT: s_load_dwordx2 s[40:41], s[16:17], 0x0
+; SDAG-NEXT: v_writelane_b32 v18, s30, 0
+; SDAG-NEXT: s_mov_b32 s16, 42
+; SDAG-NEXT: s_mov_b32 s17, 1
+; SDAG-NEXT: s_mov_b32 s18, 2
+; SDAG-NEXT: s_mov_b32 s19, 3
+; SDAG-NEXT: s_mov_b32 s20, 4
+; SDAG-NEXT: s_mov_b32 s21, 5
+; SDAG-NEXT: s_mov_b32 s22, 6
+; SDAG-NEXT: s_mov_b32 s23, 7
+; SDAG-NEXT: s_mov_b32 s24, 8
+; SDAG-NEXT: s_mov_b32 s25, 9
+; SDAG-NEXT: s_mov_b32 s26, 10
+; SDAG-NEXT: s_mov_b32 s27, 11
+; SDAG-NEXT: s_mov_b32 s28, 12
+; SDAG-NEXT: s_mov_b32 s29, 13
+; SDAG-NEXT: v_mov_b32_e32 v0, 14
+; SDAG-NEXT: v_mov_b32_e32 v1, 15
+; SDAG-NEXT: v_mov_b32_e32 v2, 16
+; SDAG-NEXT: v_mov_b32_e32 v3, 17
+; SDAG-NEXT: v_mov_b32_e32 v4, 18
+; SDAG-NEXT: v_mov_b32_e32 v5, 19
+; SDAG-NEXT: v_mov_b32_e32 v6, 20
+; SDAG-NEXT: v_mov_b32_e32 v7, 21
+; SDAG-NEXT: v_mov_b32_e32 v8, 22
+; SDAG-NEXT: v_mov_b32_e32 v9, 23
+; SDAG-NEXT: v_mov_b32_e32 v10, 24
+; SDAG-NEXT: v_mov_b32_e32 v11, 25
+; SDAG-NEXT: v_mov_b32_e32 v12, 26
+; SDAG-NEXT: v_mov_b32_e32 v13, 27
+; SDAG-NEXT: v_mov_b32_e32 v14, 28
+; SDAG-NEXT: v_mov_b32_e32 v15, 29
+; SDAG-NEXT: v_mov_b32_e32 v16, 30
+; SDAG-NEXT: v_mov_b32_e32 v17, 31
+; SDAG-NEXT: v_writelane_b32 v18, s31, 1
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[40:41]
+; SDAG-NEXT: v_readlane_b32 s31, v18, 1
+; SDAG-NEXT: v_readlane_b32 s30, v18, 0
+; SDAG-NEXT: s_mov_b32 s32, s33
+; SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; SDAG-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
+; SDAG-NEXT: s_mov_b64 exec, s[4:5]
+; SDAG-NEXT: s_mov_b32 s33, s42
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: caller_passes_42:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s42, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GISEL-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
+; GISEL-NEXT: s_mov_b64 exec, s[16:17]
+; GISEL-NEXT: s_addk_i32 s32, 0x400
+; GISEL-NEXT: s_getpc_b64 s[16:17]
+; GISEL-NEXT: s_add_u32 s16, s16, callee_returns_arg0@gotpcrel32@lo+4
+; GISEL-NEXT: s_addc_u32 s17, s17, callee_returns_arg0@gotpcrel32@hi+12
+; GISEL-NEXT: s_load_dwordx2 s[40:41], s[16:17], 0x0
+; GISEL-NEXT: v_writelane_b32 v18, s30, 0
+; GISEL-NEXT: s_mov_b32 s16, 42
+; GISEL-NEXT: s_mov_b32 s17, 1
+; GISEL-NEXT: s_mov_b32 s18, 2
+; GISEL-NEXT: s_mov_b32 s19, 3
+; GISEL-NEXT: s_mov_b32 s20, 4
+; GISEL-NEXT: s_mov_b32 s21, 5
+; GISEL-NEXT: s_mov_b32 s22, 6
+; GISEL-NEXT: s_mov_b32 s23, 7
+; GISEL-NEXT: s_mov_b32 s24, 8
+; GISEL-NEXT: s_mov_b32 s25, 9
+; GISEL-NEXT: s_mov_b32 s26, 10
+; GISEL-NEXT: s_mov_b32 s27, 11
+; GISEL-NEXT: s_mov_b32 s28, 12
+; GISEL-NEXT: s_mov_b32 s29, 13
+; GISEL-NEXT: v_mov_b32_e32 v0, 14
+; GISEL-NEXT: v_mov_b32_e32 v1, 15
+; GISEL-NEXT: v_mov_b32_e32 v2, 16
+; GISEL-NEXT: v_mov_b32_e32 v3, 17
+; GISEL-NEXT: v_mov_b32_e32 v4, 18
+; GISEL-NEXT: v_mov_b32_e32 v5, 19
+; GISEL-NEXT: v_mov_b32_e32 v6, 20
+; GISEL-NEXT: v_mov_b32_e32 v7, 21
+; GISEL-NEXT: v_mov_b32_e32 v8, 22
+; GISEL-NEXT: v_mov_b32_e32 v9, 23
+; GISEL-NEXT: v_mov_b32_e32 v10, 24
+; GISEL-NEXT: v_mov_b32_e32 v11, 25
+; GISEL-NEXT: v_mov_b32_e32 v12, 26
+; GISEL-NEXT: v_mov_b32_e32 v13, 27
+; GISEL-NEXT: v_mov_b32_e32 v14, 28
+; GISEL-NEXT: v_mov_b32_e32 v15, 29
+; GISEL-NEXT: v_mov_b32_e32 v16, 30
+; GISEL-NEXT: v_mov_b32_e32 v17, 31
+; GISEL-NEXT: v_writelane_b32 v18, s31, 1
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[40:41]
+; GISEL-NEXT: v_readlane_b32 s31, v18, 1
+; GISEL-NEXT: v_readlane_b32 s30, v18, 0
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GISEL-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
+; GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GISEL-NEXT: s_mov_b32 s33, s42
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @callee_returns_arg0(
i32 inreg 42, i32 inreg 1, i32 inreg 2, i32 inreg 3,
i32 inreg 4, i32 inreg 5, i32 inreg 6, i32 inreg 7,
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 831d10480c51c..0bf987ab6f19c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1734,9 +1734,6 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
; GFX9-NEXT: s_load_dwordx2 s[20:21], s[20:21], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s19, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s2, s18
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll
index 41e7ad7b8e2fa..86275a8444c45 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll
@@ -43,10 +43,10 @@ define void @tail_call_i64_inreg_uniform_in_vgpr_convergence_tokens() #0 {
; CHECK-NEXT: $sgpr14 = COPY [[COPY2]]
; CHECK-NEXT: $sgpr15 = COPY [[COPY1]]
; CHECK-NEXT: $vgpr31 = COPY [[COPY]]
- ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; CHECK-NEXT: $sgpr16 = COPY [[V_READFIRSTLANE_B32_]]
+ ; CHECK-NEXT: $sgpr17 = COPY [[V_READFIRSTLANE_B32_1]]
; CHECK-NEXT: CONVERGENCECTRL_GLUE [[CONVERGENCECTRL_ENTRY]]
- ; CHECK-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @void_func_i64_inreg, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0, implicit $sgpr1, implicit [[CONVERGENCECTRL_ENTRY]]
+ ; CHECK-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @void_func_i64_inreg, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr16, implicit $sgpr17, implicit [[CONVERGENCECTRL_ENTRY]]
%t = call token @llvm.experimental.convergence.entry()
%uniform.vgpr = load i64, ptr addrspace(3) zeroinitializer, align 8
tail call void @void_func_i64_inreg(i64 inreg %uniform.vgpr) #0 [ "convergencectrl"(token %t) ]
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
index 242b5e9aeaf42..cb34262e2d4ee 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll
@@ -21,11 +21,11 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) {
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, void_func_i32_inreg@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12
-; CHECK-NEXT: ; illegal copy v0 to s0
-; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: s_getpc_b64 s[18:19]
+; CHECK-NEXT: s_add_u32 s18, s18, void_func_i32_inreg@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s19, s19, void_func_i32_inreg@rel32@hi+12
+; CHECK-NEXT: ; illegal copy v0 to s16
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: s_mov_b32 s32, s33
@@ -56,12 +56,12 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) {
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, constant@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, constant@rel32@hi+12
-; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
-; CHECK-NEXT: ; illegal copy v0 to s0
+; CHECK-NEXT: ; illegal copy v0 to s16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: s_mov_b32 s32, s33
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
index 5506b29b99895..2449812e61661 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
@@ -10,7 +10,6 @@ define void @tail_call_i32_inreg_uniform(i32 inreg %sgpr) {
; CHECK-NEXT: s_getpc_b64 s[18:19]
; CHECK-NEXT: s_add_u32 s18, s18, void_func_i32_inreg@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s19, s19, void_func_i32_inreg@rel32@hi+12
-; CHECK-NEXT: s_mov_b32 s0, s16
; CHECK-NEXT: s_setpc_b64 s[18:19]
tail call void @void_func_i32_inreg(i32 inreg %sgpr)
ret void
@@ -26,7 +25,6 @@ define void @indirect_tail_call_i32_inreg_uniform(i32 inreg %sgpr) {
; CHECK-NEXT: s_add_u32 s18, s18, constant at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s19, s19, constant at rel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0
-; CHECK-NEXT: s_mov_b32 s0, s16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[18:19]
%fptr = load ptr, ptr addrspace(4) @constant, align 8
@@ -44,8 +42,6 @@ define void @tail_call_i64_inreg_uniform(i64 inreg %sgpr) {
; CHECK-NEXT: s_add_u32 s18, s18, void_func_i64_inreg@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s19, s19, void_func_i64_inreg@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0
-; CHECK-NEXT: s_mov_b32 s1, s17
-; CHECK-NEXT: s_mov_b32 s0, s16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[18:19]
tail call void @void_func_i64_inreg(i64 inreg %sgpr)
@@ -61,11 +57,11 @@ define void @tail_call_i64_inreg_uniform_in_vgpr() {
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, void_func_i64_inreg@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i64_inreg@gotpcrel32@hi+12
-; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: s_setpc_b64 s[16:17]
+; CHECK-NEXT: v_readfirstlane_b32 s16, v0
+; CHECK-NEXT: v_readfirstlane_b32 s17, v1
+; CHECK-NEXT: s_setpc_b64 s[18:19]
%uniform.vgpr = load i64, ptr addrspace(3) zeroinitializer, align 8
tail call void @void_func_i64_inreg(i64 inreg %uniform.vgpr)
ret void
More information about the llvm-branch-commits
mailing list