[llvm] c1eb790 - [ARM] Tail-calls do not require caller and callee arguments to match
Oliver Stannard via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 25 01:34:56 PDT 2024
Author: Oliver Stannard
Date: 2024-10-25T09:34:08+01:00
New Revision: c1eb790cd2f2a3fd48781167b50f091c0d20be8d
URL: https://github.com/llvm/llvm-project/commit/c1eb790cd2f2a3fd48781167b50f091c0d20be8d
DIFF: https://github.com/llvm/llvm-project/commit/c1eb790cd2f2a3fd48781167b50f091c0d20be8d.diff
LOG: [ARM] Tail-calls do not require caller and callee arguments to match
The ARM backend was checking that the outgoing values for a tail-call
matched the incoming argument values of the caller. This isn't
necessary, because the caller can change the values in both registers
and the stack before doing the tail-call. The actual limitation is that
the callee can't need more stack space for its arguments than the
caller does.
This is needed for code using the musttail attribute, as well as
enabling tail calls as an optimisation in more cases.
Added:
llvm/test/CodeGen/ARM/musttail.ll
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/ARM/fp-arg-shuffle.ll
llvm/test/CodeGen/ARM/fp16-vector-argument.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 25c74f52bd8706..222f08dab03f72 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2962,50 +2962,6 @@ void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
Size = std::max<int>(Size - Excess, 0);
}
-/// MatchingStackOffset - Return true if the given stack call argument is
-/// already available in the same position (relatively) of the caller's
-/// incoming argument stack.
-static
-bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
- MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
- const TargetInstrInfo *TII) {
- unsigned Bytes = Arg.getValueSizeInBits() / 8;
- int FI = std::numeric_limits<int>::max();
- if (Arg.getOpcode() == ISD::CopyFromReg) {
- Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
- if (!VR.isVirtual())
- return false;
- MachineInstr *Def = MRI->getVRegDef(VR);
- if (!Def)
- return false;
- if (!Flags.isByVal()) {
- if (!TII->isLoadFromStackSlot(*Def, FI))
- return false;
- } else {
- return false;
- }
- } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
- if (Flags.isByVal())
- // ByVal argument is passed in as a pointer but it's now being
- // dereferenced. e.g.
- // define @foo(%struct.X* %A) {
- // tail call @bar(%struct.X* byval %A)
- // }
- return false;
- SDValue Ptr = Ld->getBasePtr();
- FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
- if (!FINode)
- return false;
- FI = FINode->getIndex();
- } else
- return false;
-
- assert(FI != std::numeric_limits<int>::max());
- if (!MFI.isFixedObjectIndex(FI))
- return false;
- return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
-}
-
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function. Note that this function also
@@ -3130,64 +3086,17 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
// If the callee takes no arguments then go on to check the results of the
// call.
- if (!Outs.empty()) {
- if (CCInfo.getStackSize()) {
- // Check if the arguments are already laid out in the right way as
- // the caller's fixed stack objects.
- MachineFrameInfo &MFI = MF.getFrameInfo();
- const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
- i != e;
- ++i, ++realArgIdx) {
- CCValAssign &VA = ArgLocs[i];
- EVT RegVT = VA.getLocVT();
- SDValue Arg = OutVals[realArgIdx];
- ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
- if (VA.getLocInfo() == CCValAssign::Indirect) {
- LLVM_DEBUG(dbgs() << "false (indirect arg)\n");
- return false;
- }
- if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
- // f64 and vector types are split into multiple registers or
- // register/stack-slot combinations. The types will not match
- // the registers; give up on memory f64 refs until we figure
- // out what to do about this.
- if (!VA.isRegLoc()) {
- LLVM_DEBUG(dbgs() << "false (f64 not in register)\n");
- return false;
- }
- if (!ArgLocs[++i].isRegLoc()) {
- LLVM_DEBUG(dbgs() << "false (f64 not in register, second half)\n");
- return false;
- }
- if (RegVT == MVT::v2f64) {
- if (!ArgLocs[++i].isRegLoc()) {
- LLVM_DEBUG(dbgs() << "false (v2f64 not in register)\n");
- return false;
- }
- if (!ArgLocs[++i].isRegLoc()) {
- LLVM_DEBUG(dbgs() << "false (v2f64 not in register, second half)\n");
- return false;
- }
- }
- } else if (!VA.isRegLoc()) {
- if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
- MFI, MRI, TII)) {
- LLVM_DEBUG(dbgs() << "false (non-matching stack offset)\n");
- return false;
- }
- }
- }
- }
-
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
- LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
- return false;
- }
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
+ LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
+ return false;
}
+ // If the stack arguments for this call do not fit into our own save area then
+ // the call cannot be made tail.
+ if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
+ return false;
+
LLVM_DEBUG(dbgs() << "true\n");
return true;
}
diff --git a/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll b/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll
index 36f5a4b30af409..1e46b081acfdf9 100644
--- a/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll
+++ b/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll
@@ -2,31 +2,29 @@
; RUN: llc -mtriple=arm-eabi -mattr=+neon -float-abi=soft %s -o - | FileCheck %s
; CHECK: function1
-; CHECK-NOT: vmov
define double @function1(double %a, double %b, double %c, double %d, double %e, double %f) nounwind noinline ssp {
; CHECK-LABEL: function1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r11, lr}
; CHECK-NEXT: push {r4, r5, r11, lr}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: add lr, sp, #64
-; CHECK-NEXT: vldr d16, [sp, #56]
-; CHECK-NEXT: str r2, [sp, #16]
-; CHECK-NEXT: ldm lr, {r4, r5, r12, lr}
-; CHECK-NEXT: str r3, [sp, #20]
-; CHECK-NEXT: mov r3, r5
-; CHECK-NEXT: str r0, [sp, #24]
+; CHECK-NEXT: vldr d16, [sp, #40]
+; CHECK-NEXT: vldr d17, [sp, #32]
+; CHECK-NEXT: vmov r12, lr, d16
+; CHECK-NEXT: vldr d16, [sp, #16]
+; CHECK-NEXT: vmov r4, r5, d17
+; CHECK-NEXT: vldr d17, [sp, #24]
+; CHECK-NEXT: str r3, [sp, #36]
+; CHECK-NEXT: str r2, [sp, #32]
+; CHECK-NEXT: str r1, [sp, #44]
+; CHECK-NEXT: str r0, [sp, #40]
+; CHECK-NEXT: vstr d17, [sp, #16]
+; CHECK-NEXT: vstr d16, [sp, #24]
; CHECK-NEXT: mov r0, r12
-; CHECK-NEXT: str r1, [sp, #28]
; CHECK-NEXT: mov r1, lr
; CHECK-NEXT: mov r2, r4
-; CHECK-NEXT: vldr d17, [sp, #48]
-; CHECK-NEXT: vstmia sp, {d16, d17}
-; CHECK-NEXT: bl function2
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: mov r3, r5
; CHECK-NEXT: pop {r4, r5, r11, lr}
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: b function2
entry:
%call = tail call double @function2(double %f, double %e, double %d, double %c, double %b, double %a) nounwind
ret double %call
diff --git a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
index 6fc56967bc7aa9..65aff46658fd1d 100644
--- a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
+++ b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
@@ -145,26 +145,21 @@ entry:
define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x half>, <8 x half>) {
; SOFT-LABEL: many_args_test:
; SOFT: @ %bb.0: @ %entry
-; SOFT-NEXT: push {r11, lr}
-; SOFT-NEXT: sub sp, sp, #32
-; SOFT-NEXT: add r12, sp, #80
+; SOFT-NEXT: add r12, sp, #40
; SOFT-NEXT: vld1.64 {d16, d17}, [r12]
-; SOFT-NEXT: add r12, sp, #48
+; SOFT-NEXT: add r12, sp, #8
; SOFT-NEXT: vabs.f16 q8, q8
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
-; SOFT-NEXT: add r12, sp, #64
+; SOFT-NEXT: add r12, sp, #24
; SOFT-NEXT: vadd.f16 q8, q8, q9
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
; SOFT-NEXT: add r12, sp, #16
; SOFT-NEXT: vmul.f16 q8, q9, q8
; SOFT-NEXT: vst1.64 {d16, d17}, [r12]
-; SOFT-NEXT: mov r12, sp
-; SOFT-NEXT: vldr d16, [sp, #40]
-; SOFT-NEXT: vst1.16 {d16}, [r12:64]!
-; SOFT-NEXT: str r3, [r12]
-; SOFT-NEXT: bl use
-; SOFT-NEXT: add sp, sp, #32
-; SOFT-NEXT: pop {r11, pc}
+; SOFT-NEXT: vldr d16, [sp]
+; SOFT-NEXT: vstr d16, [sp]
+; SOFT-NEXT: str r3, [sp, #8]
+; SOFT-NEXT: b use
;
; HARD-LABEL: many_args_test:
; HARD: @ %bb.0: @ %entry
@@ -177,33 +172,25 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
;
; SOFTEB-LABEL: many_args_test:
; SOFTEB: @ %bb.0: @ %entry
-; SOFTEB-NEXT: .save {r11, lr}
-; SOFTEB-NEXT: push {r11, lr}
-; SOFTEB-NEXT: .pad #32
-; SOFTEB-NEXT: sub sp, sp, #32
-; SOFTEB-NEXT: add r12, sp, #80
-; SOFTEB-NEXT: mov lr, sp
+; SOFTEB-NEXT: add r12, sp, #40
; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12]
-; SOFTEB-NEXT: add r12, sp, #48
+; SOFTEB-NEXT: add r12, sp, #8
; SOFTEB-NEXT: vrev64.16 q8, q8
; SOFTEB-NEXT: vabs.f16 q8, q8
; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12]
-; SOFTEB-NEXT: add r12, sp, #64
+; SOFTEB-NEXT: add r12, sp, #24
; SOFTEB-NEXT: vrev64.16 q9, q9
; SOFTEB-NEXT: vadd.f16 q8, q8, q9
; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12]
; SOFTEB-NEXT: add r12, sp, #16
; SOFTEB-NEXT: vrev64.16 q9, q9
; SOFTEB-NEXT: vmul.f16 q8, q9, q8
-; SOFTEB-NEXT: vldr d18, [sp, #40]
-; SOFTEB-NEXT: vrev64.16 d18, d18
-; SOFTEB-NEXT: vst1.16 {d18}, [lr:64]!
-; SOFTEB-NEXT: str r3, [lr]
+; SOFTEB-NEXT: vldr d18, [sp]
; SOFTEB-NEXT: vrev64.16 q8, q8
; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12]
-; SOFTEB-NEXT: bl use
-; SOFTEB-NEXT: add sp, sp, #32
-; SOFTEB-NEXT: pop {r11, pc}
+; SOFTEB-NEXT: vstr d18, [sp]
+; SOFTEB-NEXT: str r3, [sp, #8]
+; SOFTEB-NEXT: b use
;
; HARDEB-LABEL: many_args_test:
; HARDEB: @ %bb.0: @ %entry
diff --git a/llvm/test/CodeGen/ARM/musttail.ll b/llvm/test/CodeGen/ARM/musttail.ll
new file mode 100644
index 00000000000000..622bea3f876351
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/musttail.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=armv7a-none-eabi %s -o - | FileCheck %s
+
+declare i32 @many_args_callee(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5)
+
+define i32 @many_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
+; CHECK-LABEL: many_args_tail:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r0, #5
+; CHECK-NEXT: mov r1, #2
+; CHECK-NEXT: str r0, [sp]
+; CHECK-NEXT: mov r0, #6
+; CHECK-NEXT: str r0, [sp, #4]
+; CHECK-NEXT: mov r0, #1
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: mov r3, #4
+; CHECK-NEXT: b many_args_callee
+ %ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+ ret i32 %ret
+}
+
+define i32 @many_args_musttail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
+; CHECK-LABEL: many_args_musttail:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r0, #5
+; CHECK-NEXT: mov r1, #2
+; CHECK-NEXT: str r0, [sp]
+; CHECK-NEXT: mov r0, #6
+; CHECK-NEXT: str r0, [sp, #4]
+; CHECK-NEXT: mov r0, #1
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: mov r3, #4
+; CHECK-NEXT: b many_args_callee
+ %ret = musttail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+ ret i32 %ret
+}
+
+; This function has more arguments than its tail-callee. This isn't valid for
+; the musttail attribute, but can still be tail-called as a non-guaranteed
+; optimisation, because the outgoing arguments to @many_args_callee fit in the
+; stack space allocated by the caller of @more_args_tail.
+define i32 @more_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
+; CHECK-LABEL: more_args_tail:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r0, #5
+; CHECK-NEXT: mov r1, #2
+; CHECK-NEXT: str r0, [sp]
+; CHECK-NEXT: mov r0, #6
+; CHECK-NEXT: str r0, [sp, #4]
+; CHECK-NEXT: mov r0, #1
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: mov r3, #4
+; CHECK-NEXT: b many_args_callee
+ %ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+ ret i32 %ret
+}
+
+; Again, this isn't valid for musttail, but can be tail-called in practice
+; because the stack size is the same.
+define i32 @different_args_tail(i64 %0, i64 %1, i64 %2) {
+; CHECK-LABEL: different_args_tail:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r0, #5
+; CHECK-NEXT: mov r1, #2
+; CHECK-NEXT: str r0, [sp]
+; CHECK-NEXT: mov r0, #6
+; CHECK-NEXT: str r0, [sp, #4]
+; CHECK-NEXT: mov r0, #1
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: mov r3, #4
+; CHECK-NEXT: b many_args_callee
+ %ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+ ret i32 %ret
+}
+
+; Here, the caller requires less stack space for its arguments than the
+; callee, so it would not be valid to do a tail-call.
+define i32 @fewer_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) {
+; CHECK-LABEL: fewer_args_tail:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, sp, #8
+; CHECK-NEXT: mov r1, #6
+; CHECK-NEXT: mov r0, #5
+; CHECK-NEXT: strd r0, r1, [sp]
+; CHECK-NEXT: mov r0, #1
+; CHECK-NEXT: mov r1, #2
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: mov r3, #4
+; CHECK-NEXT: bl many_args_callee
+; CHECK-NEXT: add sp, sp, #8
+; CHECK-NEXT: pop {r11, pc}
+ %ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+ ret i32 %ret
+}
More information about the llvm-commits
mailing list