[llvm] 78ec2e2 - [ARM] Allow tail calls with byval args
Oliver Stannard via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 25 01:34:59 PDT 2024
Author: Oliver Stannard
Date: 2024-10-25T09:34:08+01:00
New Revision: 78ec2e2ed5e3dc61339b51f9d614029314a42005
URL: https://github.com/llvm/llvm-project/commit/78ec2e2ed5e3dc61339b51f9d614029314a42005
DIFF: https://github.com/llvm/llvm-project/commit/78ec2e2ed5e3dc61339b51f9d614029314a42005.diff
LOG: [ARM] Allow tail calls with byval args
Byval arguments that are passed partially in registers are stored into
the caller's local stack frame, but it is still valid to tail-call with
them: the part that gets spilled is always re-loaded into registers
before the tail call, so the spill area can safely be deallocated.
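
For illustration, a minimal IR sketch of the pattern this enables (the
type and function names here are made up; the patch's own test in
llvm/test/CodeGen/ARM/musttail.ll uses an equivalent 20-byte struct):

%five_ints = type { [5 x i32] }
declare void @callee(ptr byval(%five_ints) align 4)

define void @caller(ptr byval(%five_ints) align 4 %a) {
entry:
  ; The first 16 bytes of %a arrive in r0-r3 and are spilled to the local
  ; frame on entry; they are re-loaded into r0-r3 before the tail call, so
  ; the spill area can be deallocated and the call lowered to a branch.
  musttail call void @callee(ptr byval(%five_ints) align 4 %a)
  ret void
}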
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
llvm/test/CodeGen/ARM/musttail.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 4ccfe1b8cd1a96..78d2b98b0a84a1 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3075,11 +3075,11 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
}
}
- // If Caller's vararg or byval argument has been split between registers and
- // stack, do not perform tail call, since part of the argument is in caller's
- // local frame.
+ // If Caller's vararg argument has been split between registers and stack, do
+ // not perform tail call, since part of the argument is in caller's local
+ // frame.
const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
- if (AFI_Caller->getArgRegsSaveSize()) {
+ if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
return false;
}
diff --git a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
index d8e22f4f5312ae..e186ae3a961502 100644
--- a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
+++ b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
@@ -12,17 +12,11 @@ define void @check227(
; arg1 --> SP+188
entry:
-
-;CHECK: sub sp, sp, #12
-;CHECK: push {r11, lr}
-;CHECK: sub sp, sp, #4
-;CHECK: add r0, sp, #12
-;CHECK: stm r0, {r1, r2, r3}
-;CHECK: ldr r0, [sp, #212]
-;CHECK: bl useInt
-;CHECK: add sp, sp, #4
-;CHECK: pop {r11, lr}
-;CHECK: add sp, sp, #12
+; CHECK: sub sp, sp, #12
+; CHECK: stm sp, {r1, r2, r3}
+; CHECK: ldr r0, [sp, #200]
+; CHECK: add sp, sp, #12
+; CHECK: b useInt
%0 = ptrtoint ptr %arg1 to i32
tail call void @useInt(i32 %0)
diff --git a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
index 0c5d22984b99e1..efdecce9ae723a 100644
--- a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
+++ b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
@@ -7,14 +7,11 @@
define void @foo(ptr byval(%struct4bytes) %p0, ; --> R0
ptr byval(%struct20bytes) %p1 ; --> R1,R2,R3, [SP+0 .. SP+8)
) {
-;CHECK: sub sp, sp, #16
-;CHECK: push {r11, lr}
-;CHECK: add r12, sp, #8
-;CHECK: stm r12, {r0, r1, r2, r3}
-;CHECK: add r0, sp, #12
-;CHECK: bl useInt
-;CHECK: pop {r11, lr}
-;CHECK: add sp, sp, #16
+;CHECK: sub sp, sp, #16
+;CHECK: stm sp, {r0, r1, r2, r3}
+;CHECK: add r0, sp, #4
+;CHECK: add sp, sp, #16
+;CHECK: b useInt
%1 = ptrtoint ptr %p1 to i32
tail call void @useInt(i32 %1)
diff --git a/llvm/test/CodeGen/ARM/musttail.ll b/llvm/test/CodeGen/ARM/musttail.ll
index 6db45aa9e6285a..aecf8e4579b5df 100644
--- a/llvm/test/CodeGen/ARM/musttail.ll
+++ b/llvm/test/CodeGen/ARM/musttail.ll
@@ -117,3 +117,115 @@ entry:
musttail call void @sret_callee(ptr sret({ double, double }) align 8 %result)
ret void
}
+
+; Clang only uses byval for arguments of 65 bytes or larger, but we test with a
+; 20 byte struct to keep the tests more readable. This size was chosen to still
+; make sure that it will be split between registers and the stack, to test all
+; of the interesting code paths in the backend.
+%twenty_bytes = type { [5 x i32] }
+declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
+
+; Functions with byval parameters can be tail-called, because the value is
+; actually passed in registers and the stack in the same way for the caller and
+; callee. Within @large_caller the first 16 bytes of the argument are spilled
+; to the local stack frame, but for the tail-call they are passed in r0-r3, so
+; it's safe to de-allocate that memory before the call. Most of the code
+; generated for this isn't needed, but that's a missed optimisation, not a
+; correctness issue.
+define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; CHECK-LABEL: large_caller:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: add r12, sp, #8
+; CHECK-NEXT: add lr, sp, #24
+; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
+; CHECK-NEXT: add r12, sp, #8
+; CHECK-NEXT: add r12, r12, #16
+; CHECK-NEXT: ldr r4, [r12], #4
+; CHECK-NEXT: str r4, [lr], #4
+; CHECK-NEXT: pop {r4, lr}
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b large_callee
+entry:
+ musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a)
+ ret void
+}
+
+; As above, but with some inline asm to test that the arguments in r0-r3 are
+; re-loaded before the call.
+define void @large_caller_check_regs(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; CHECK-LABEL: large_caller_check_regs:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: add r12, sp, #8
+; CHECK-NEXT: add lr, sp, #24
+; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
+; CHECK-NEXT: @APP
+; CHECK-NEXT: @NO_APP
+; CHECK-NEXT: add r3, sp, #8
+; CHECK-NEXT: add r0, sp, #8
+; CHECK-NEXT: add r12, r0, #16
+; CHECK-NEXT: ldm r3, {r0, r1, r2, r3}
+; CHECK-NEXT: ldr r4, [r12], #4
+; CHECK-NEXT: str r4, [lr], #4
+; CHECK-NEXT: pop {r4, lr}
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b large_callee
+entry:
+ tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+ musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a)
+ ret void
+}
+
+; The IR for this one looks dodgy, because it has an alloca passed to a
+; musttail function, but it is passed as a byval argument, so will be copied
+; into the stack space allocated by @large_caller_new_value's caller, so is
+; valid.
+define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; CHECK-LABEL: large_caller_new_value:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #36
+; CHECK-NEXT: sub sp, sp, #36
+; CHECK-NEXT: add r12, sp, #20
+; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
+; CHECK-NEXT: mov r0, #4
+; CHECK-NEXT: add r1, sp, #36
+; CHECK-NEXT: str r0, [sp, #16]
+; CHECK-NEXT: mov r0, #3
+; CHECK-NEXT: str r0, [sp, #12]
+; CHECK-NEXT: mov r0, #2
+; CHECK-NEXT: str r0, [sp, #8]
+; CHECK-NEXT: mov r0, #1
+; CHECK-NEXT: str r0, [sp, #4]
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: str r0, [sp]
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: add r0, r0, #16
+; CHECK-NEXT: mov r3, #3
+; CHECK-NEXT: ldr r2, [r0], #4
+; CHECK-NEXT: str r2, [r1], #4
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: mov r1, #1
+; CHECK-NEXT: mov r2, #2
+; CHECK-NEXT: add sp, sp, #36
+; CHECK-NEXT: b large_callee
+entry:
+ %y = alloca %twenty_bytes, align 4
+ store i32 0, ptr %y, align 4
+ %0 = getelementptr inbounds i8, ptr %y, i32 4
+ store i32 1, ptr %0, align 4
+ %1 = getelementptr inbounds i8, ptr %y, i32 8
+ store i32 2, ptr %1, align 4
+ %2 = getelementptr inbounds i8, ptr %y, i32 12
+ store i32 3, ptr %2, align 4
+ %3 = getelementptr inbounds i8, ptr %y, i32 16
+ store i32 4, ptr %3, align 4
+ musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y)
+ ret void
+}