[llvm] 1ab8b9a - AMDGPU: Define sub-class of SGPR_64 for tail call return
Changpeng Fang via llvm-commits
llvm-commits@lists.llvm.org
Thu Apr 27 10:45:56 PDT 2023
Author: Changpeng Fang
Date: 2023-04-27T10:45:11-07:00
New Revision: 1ab8b9ae159bf6048da9e7350d4c2f694912501f
URL: https://github.com/llvm/llvm-project/commit/1ab8b9ae159bf6048da9e7350d4c2f694912501f
DIFF: https://github.com/llvm/llvm-project/commit/1ab8b9ae159bf6048da9e7350d4c2f694912501f.diff
LOG: AMDGPU: Define sub-class of SGPR_64 for tail call return
Summary:
Registers holding the tail-call return address must not be clobbered by the
callee, so we need a sub-class of SGPR_64 that excludes the callee-saved
registers (CSRs) to hold that address.
Because the GFX and C calling conventions have different CSR sets, the
sub-class has to be defined separately for each. This work extends D147096
to also take the GFX calling convention into account.
Depending on the calling convention, a different instruction is selected,
taking the corresponding sub-class of SGPR_64 as its input.
Reviewers: arsenm, cdevadas and sebastian-ne
Differential Revision: https://reviews.llvm.org/D148824
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/SIRegisterInfo.td
llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index da647e53bd26..09207319acee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -958,10 +958,14 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
}
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
- bool IsTailCall) {
+ bool IsTailCall, CallingConv::ID CC) {
assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, "
"because the address can be divergent");
- return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::G_SI_CALL;
+ if (!IsTailCall)
+ return AMDGPU::G_SI_CALL;
+
+ return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
+ AMDGPU::SI_TCRETURN;
}
// Add operands to call instruction to track the callee.
@@ -1186,7 +1190,7 @@ bool AMDGPUCallLowering::lowerTailCall(
if (!IsSibCall)
CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
- unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
if (!addCallTargetOperands(MIB, MIRBuilder, Info))
return false;
@@ -1350,7 +1354,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
- unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.addDef(TRI->getReturnAddressReg(MF));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 197cb466c918..7c25eeaddc6a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4587,6 +4587,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LOOP)
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(TC_RETURN)
+ NODE_NAME_CASE(TC_RETURN_GFX)
NODE_NAME_CASE(TRAP)
NODE_NAME_CASE(RET_GLUE)
NODE_NAME_CASE(RETURN_TO_EPILOG)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index c20c7a72cbed..84dfc376b797 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -367,6 +367,7 @@ enum NodeType : unsigned {
// Function call.
CALL,
TC_RETURN,
+ TC_RETURN_GFX,
TRAP,
// Masked control flow nodes.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index fbde546cf07e..d1e19e69cce3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -85,9 +85,16 @@ def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
SDNPVariadic]
>;
-def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN",
- SDTypeProfile<0, 3, [SDTCisPtrTy<0>]>,
- [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+def AMDGPUTCReturnTP : SDTypeProfile<0, 3, [
+ SDTCisPtrTy<0>
+]>;
+
+def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", AMDGPUTCReturnTP,
+[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
+def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP,
+[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 9ecad3be663e..508517f754d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -134,7 +134,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.addOperand(Dest);
OutMI.addOperand(Src);
return;
- } else if (Opcode == AMDGPU::SI_TCRETURN) {
+ } else if (Opcode == AMDGPU::SI_TCRETURN ||
+ Opcode == AMDGPU::SI_TCRETURN_GFX) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 33d4d0189ee6..a3aa8bd371ab 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3396,7 +3396,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// actual call instruction.
if (IsTailCall) {
MFI.setHasTailCall();
- return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
+ unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ?
+ AMDGPUISD::TC_RETURN_GFX : AMDGPUISD::TC_RETURN;
+ return DAG.getNode(OPC, DL, NodeTys, Ops);
}
// Returns a chain and a flag for retval copy to use.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 38210b75e79a..fb267f026042 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -598,10 +598,9 @@ def SI_CALL : SPseudoInstSI <
let isConvergent = 1;
}
-// Tail call handling pseudo
-def SI_TCRETURN : SPseudoInstSI <(outs),
- (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff),
- [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
+class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
+ (ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
+ [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
let Size = 4;
let FixedSize = 1;
let isCall = 1;
@@ -614,12 +613,22 @@ def SI_TCRETURN : SPseudoInstSI <(outs),
let isConvergent = 1;
}
+// Tail call handling pseudo
+def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
+def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;
+
// Handle selecting indirect tail calls
def : GCNPat<
(AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
(SI_TCRETURN CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
>;
+// Handle selecting indirect tail calls for AMDGPU_gfx
+def : GCNPat<
+ (AMDGPUtc_return_gfx i64:$src0, (i64 0), (i32 timm:$fpdiff)),
+ (SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
+>;
+
def ADJCALLSTACKUP : SPseudoInstSI<
(outs), (ins i32imm:$amt0, i32imm:$amt1),
[(callseq_start timm:$amt0, timm:$amt1)],
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 060f63158b53..5b8aac40b95b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -817,12 +817,19 @@ def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16],
let HasSGPR = 1;
}
-// CCR (call clobbered registers) SGPR 64-bit registers, currently used for tail call
-// return address only.
-def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, (add (trunc SGPR_64, 16))> {
+// CCR (call clobbered registers) SGPR 64-bit registers
+def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, (add (trunc SGPR_64, 15))> {
let CopyCost = SGPR_64.CopyCost;
let AllocationPriority = SGPR_64.AllocationPriority;
- let HasSGPR = SGPR_64.HasSGPR;
+ let HasSGPR = 1;
+}
+
+// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC
+def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
+ (add (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[62:63]
+ let CopyCost = SGPR_64.CopyCost;
+ let AllocationPriority = SGPR_64.AllocationPriority;
+ let HasSGPR = 1;
}
def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 67e745a8a838..6e95091a6b98 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,15 +8,15 @@
define amdgpu_kernel void @s_input_output_i128() {
; GFX908-LABEL: name: s_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6946826 /* regdef:SGPR_128 */, def %4
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7143434 /* regdef:SGPR_128 */, def %4
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6946825 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7143433 /* reguse:SGPR_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
; GFX90A-LABEL: name: s_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6946826 /* regdef:SGPR_128 */, def %4
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7143434 /* regdef:SGPR_128 */, def %4
; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6946825 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7143433 /* reguse:SGPR_128 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -26,15 +26,15 @@ define amdgpu_kernel void @s_input_output_i128() {
define amdgpu_kernel void @v_input_output_i128() {
; GFX908-LABEL: name: v_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5898250 /* regdef:VReg_128 */, def %4
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %4
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %4
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5898249 /* reguse:VReg_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
; GFX90A-LABEL: name: v_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128_Align2 */, def %4
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %4
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128_Align2 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:VReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=v"()
call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -44,15 +44,15 @@ define amdgpu_kernel void @v_input_output_i128() {
define amdgpu_kernel void @a_input_output_i128() {
; GFX908-LABEL: name: a_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5832714 /* regdef:AReg_128 */, def %4
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6029322 /* regdef:AReg_128 */, def %4
; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %4
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5832713 /* reguse:AReg_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
; GFX90A-LABEL: name: a_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:AReg_128_Align2 */, def %4
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def %4
; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:AReg_128_Align2 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = call i128 asm sideeffect "; def $0", "=a"()
call void asm sideeffect "; use $0", "a"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 2450a989ecb8..8ae07a74503a 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -11,7 +11,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef %5:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5898250 /* regdef:VReg_128 */, def %26
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %26
; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %26
; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64 */, def %23
; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
@@ -35,7 +35,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5898250 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
@@ -58,7 +58,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef %5:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128_Align2 */, def %25
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %25
; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %25
; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64_Align2 */, def %23
; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
@@ -80,7 +80,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
index 4059d2d80f89..756182dc243b 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
@@ -2,8 +2,6 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -enable-var-scope %s
; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -enable-var-scope %s
-; FIXME: @caller uses s[4:5] to store the address of @callee.
-; These registers are callee-save in the amdgpu_gfx calling convention, so they must not be clobbered, but they are clobbered here.
; Callee with VGPR arguments
define hidden amdgpu_gfx float @callee(float %v.arg0, float %v.arg1) {
@@ -20,22 +18,12 @@ define amdgpu_gfx float @caller(float %arg0) {
; GCN-LABEL: caller:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[34:35]
-; GCN-NEXT: v_writelane_b32 v2, s4, 0
-; GCN-NEXT: v_writelane_b32 v2, s5, 1
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mov_b32_e32 v1, 2.0
-; GCN-NEXT: v_readlane_b32 s5, v2, 1
-; GCN-NEXT: v_readlane_b32 s4, v2, 0
-; GCN-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[34:35]
-; GCN-NEXT: s_setpc_b64 s[4:5]
+; GCN-NEXT: s_getpc_b64 s[36:37]
+; GCN-NEXT: s_add_u32 s36, s36, callee@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s37, s37, callee@rel32@hi+12
+; GCN-NEXT: s_setpc_b64 s[36:37]
%add = fadd float %arg0, 1.0
%call = tail call amdgpu_gfx float @callee(float %add, float 2.0)
ret float %call