[llvm] [X86] Elect to tail call when `sret` ptr is passed to the callee (PR #146575)
Antonio Frighetto via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 3 07:39:26 PDT 2025
https://github.com/antoniofrighetto updated https://github.com/llvm/llvm-project/pull/146575
From 48d65c91902d426396ed573d66742a992b78a807 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Thu, 3 Jul 2025 16:10:31 +0200
Subject: [PATCH 1/2] [X86] Precommit tests for PR146575 (NFC)
---
llvm/test/CodeGen/X86/sibcall.ll | 170 +++++++++++++++++++++++++++++++
1 file changed, 170 insertions(+)
diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll
index 4a0a68ee32243..e36b9b895df23 100644
--- a/llvm/test/CodeGen/X86/sibcall.ll
+++ b/llvm/test/CodeGen/X86/sibcall.ll
@@ -977,6 +977,176 @@ define ccc void @t22_non_sret_to_sret(ptr %agg.result) nounwind {
ret void
}
+; Not tailcallable, the caller returns via sret while the callee does not.
+define void @t23_sret_to_non_sret(ptr noalias sret(%struct.foo) align 4 %agg.result, ptr %arg) {
+; X86-LABEL: t23_sret_to_non_sret:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 16
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: calll callee_1@PLT
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl $4
+;
+; X64-LABEL: t23_sret_to_non_sret:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: .cfi_offset %rbx, -16
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: callq callee_1@PLT
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: retq
+;
+; X32-LABEL: t23_sret_to_non_sret:
+; X32: # %bb.0:
+; X32-NEXT: pushq %rbx
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %rbx, -16
+; X32-NEXT: movq %rdi, %rbx
+; X32-NEXT: movq %rsi, %rdi
+; X32-NEXT: callq callee_1@PLT
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popq %rbx
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: retq
+ tail call void @callee_1(ptr %arg)
+ ret void
+}
+
+; Not tailcallable, caller and callee both return via sret, but the callee
+; returns into a local alloca rather than the caller's sret slot.
+define void @t24_sret_to_sret_different_val(ptr noalias sret(%struct.foo) align 4 %agg.result, ptr %arg) {
+; X86-LABEL: t24_sret_to_sret_different_val:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: .cfi_def_cfa_offset 32
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: xorps %xmm0, %xmm0
+; X86-NEXT: movsd %xmm0, 8(%esi)
+; X86-NEXT: movsd %xmm0, (%esi)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: calll callee_2@PLT
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl $4
+;
+; X64-LABEL: t24_sret_to_sret_different_val:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: subq $16, %rsp
+; X64-NEXT: .cfi_def_cfa_offset 32
+; X64-NEXT: .cfi_offset %rbx, -16
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: movq $0, 8(%rdi)
+; X64-NEXT: movq $0, (%rdi)
+; X64-NEXT: movq %rsp, %rdi
+; X64-NEXT: callq callee_2@PLT
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: addq $16, %rsp
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: retq
+;
+; X32-LABEL: t24_sret_to_sret_different_val:
+; X32: # %bb.0:
+; X32-NEXT: pushq %rbx
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: .cfi_def_cfa_offset 32
+; X32-NEXT: .cfi_offset %rbx, -16
+; X32-NEXT: movq %rdi, %rbx
+; X32-NEXT: movq $0, 8(%ebx)
+; X32-NEXT: movq $0, (%ebx)
+; X32-NEXT: movl %esp, %edi
+; X32-NEXT: callq callee_2@PLT
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: addl $16, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: popq %rbx
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: retq
+ %callee.return = alloca %struct.foo, align 4
+ tail call void @llvm.memset.p0.i64(ptr align 4 %agg.result, i8 0, i64 16, i1 false)
+ tail call void @callee_2(ptr sret(%struct.foo) align 4 %callee.return)
+ ret void
+}
+
+; Not tailcallable, caller and callee both return via sret, but the callee is
+; passed a different sret pointer (%arg rather than %agg.result).
+define void @t25_sret_to_sret_different_val(ptr noalias sret(%struct.foo) align 8 %agg.result, ptr %arg) {
+; X86-LABEL: t25_sret_to_sret_different_val:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 16
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: calll callee_2@PLT
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl $4
+;
+; X64-LABEL: t25_sret_to_sret_different_val:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: .cfi_offset %rbx, -16
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: callq callee_2@PLT
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: retq
+;
+; X32-LABEL: t25_sret_to_sret_different_val:
+; X32: # %bb.0:
+; X32-NEXT: pushq %rbx
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %rbx, -16
+; X32-NEXT: movq %rdi, %rbx
+; X32-NEXT: movq %rsi, %rdi
+; X32-NEXT: callq callee_2@PLT
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popq %rbx
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: retq
+ tail call void @callee_2(ptr sret(%struct.foo) align 8 %arg)
+ ret void
+}
+
+declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)
+declare void @callee_1(ptr)
+declare void @callee_2(ptr noalias sret(%struct.foo))
+
declare dso_local fastcc void @t21_f_sret(ptr noalias sret(%struct.foo)) nounwind
declare dso_local fastcc void @t21_f_sret2(ptr noalias sret(%struct.foo), ptr noalias) nounwind
declare dso_local fastcc void @t21_f_non_sret(ptr) nounwind
From 434269861863dfec4619a57e1d3293521379ef17 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Thu, 3 Jul 2025 16:12:31 +0200
Subject: [PATCH 2/2] [X86] Elect to tail call when `sret` ptr is passed to the
callee
We may allow the callee to be tail-called when the caller receives a
`sret` pointer argument, as long as that same pointer is forwarded to
the callee as its `sret` argument.
Fixes: https://github.com/llvm/llvm-project/issues/146303.
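A minimal IR sketch of the two cases (function names here are illustrative,
not taken from the patch): the first caller forwards its own incoming sret
pointer as the callee's sret argument, so it becomes tail-callable; the
second passes a different pointer and must remain a plain call, since the
caller still has to return its own sret pointer afterwards.

  %struct.foo = type { i64, i64 }

  declare void @callee(ptr noalias sret(%struct.foo))

  ; Tail-callable: the caller's sret slot is also the callee's sret slot.
  define void @caller_fwd(ptr noalias sret(%struct.foo) %agg.result) {
    tail call void @callee(ptr sret(%struct.foo) %agg.result)
    ret void
  }

  ; Not tail-callable: the callee writes %other, but the caller must still
  ; return %agg.result.
  define void @caller_other(ptr noalias sret(%struct.foo) %agg.result, ptr %other) {
    tail call void @callee(ptr sret(%struct.foo) %other)
    ret void
  }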
---
llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 47 ++++++++--
llvm/test/CodeGen/X86/sibcall.ll | 99 +++------------------
2 files changed, 53 insertions(+), 93 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index cb38a39ff991d..2f0923693347a 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2767,6 +2767,38 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
return Bytes == MFI.getObjectSize(FI);
}
+static bool
+mayBeSRetTailCallCompatible(const TargetLowering::CallLoweringInfo &CLI,
+ Register CallerSRetReg) {
+ const auto &Outs = CLI.Outs;
+ const auto &OutVals = CLI.OutVals;
+
+ // We know the caller has a sret pointer argument (CallerSRetReg). Locate the
+ // index of the outgoing operand that carries a sret pointer, if any.
+ unsigned Pos = 0;
+ for (unsigned E = Outs.size(); Pos != E; ++Pos)
+ if (Outs[Pos].Flags.isSRet())
+ break;
+ // Bail out if the callee does not take any sret argument.
+ if (Pos == Outs.size())
+ return false;
+
+ // At this point, either the caller is forwarding its sret argument to the
+ // callee, or the callee is being passed a different sret pointer. Look for
+ // a CopyToReg that writes the callee sret argument into the caller's sret
+ // return register (the value that later ends up in %rax/%eax, if returned).
+ SDValue SRetArgVal = OutVals[Pos];
+ for (SDNode *User : SRetArgVal->users()) {
+ if (User->getOpcode() != ISD::CopyToReg)
+ continue;
+ Register Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (Reg == CallerSRetReg && User->getOperand(2) == SRetArgVal)
+ return true;
+ }
+
+ return false;
+}
+
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
/// Note that the x86 backend does not check musttail calls for eligibility! The
@@ -2788,6 +2820,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// If -tailcallopt is specified, make fastcc functions tail-callable.
MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Function &CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
@@ -2824,14 +2857,15 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
if (RegInfo->hasStackRealignment(MF))
return false;
- // Also avoid sibcall optimization if we're an sret return fn and the callee
- // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
- // insufficient.
- if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
+ // Avoid sibcall optimization if we are an sret return function and the callee
+ // is incompatible, unless we can prove the call is compatible. See comment in
+ // LowerReturn about why hasStructRetAttr is insufficient.
+ if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
// For a compatible tail call the callee must return our sret pointer. So it
// needs to be (a) an sret function itself and (b) we pass our sret as its
// sret. Condition #b is harder to determine.
- return false;
+ if (!mayBeSRetTailCallCompatible(CLI, SRetReg))
+ return false;
} else if (IsCalleePopSRet)
// The callee pops an sret, so we cannot tail-call, as our caller doesn't
// expect that.
@@ -2953,8 +2987,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt);
- if (unsigned BytesToPop =
- MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
+ if (unsigned BytesToPop = FuncInfo->getBytesToPopOnReturn()) {
// If we have bytes to pop, the callee must pop them.
bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
if (!CalleePopMatches)
diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll
index e36b9b895df23..2759a9883975e 100644
--- a/llvm/test/CodeGen/X86/sibcall.ll
+++ b/llvm/test/CodeGen/X86/sibcall.ll
@@ -444,21 +444,11 @@ define dso_local void @t15(ptr noalias sret(%struct.foo) %agg.result) nounwind
;
; X64-LABEL: t15:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: callq f
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: popq %rbx
-; X64-NEXT: retq
+; X64-NEXT: jmp f # TAILCALL
;
; X32-LABEL: t15:
; X32: # %bb.0:
-; X32-NEXT: pushq %rbx
-; X32-NEXT: movq %rdi, %rbx
-; X32-NEXT: callq f
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: popq %rbx
-; X32-NEXT: retq
+; X32-NEXT: jmp f # TAILCALL
tail call fastcc void @f(ptr noalias sret(%struct.foo) %agg.result) nounwind
ret void
}
@@ -607,32 +597,15 @@ declare dso_local fastcc double @foo20(double) nounwind
define fastcc void @t21_sret_to_sret(ptr noalias sret(%struct.foo) %agg.result) nounwind {
; X86-LABEL: t21_sret_to_sret:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: calll t21_f_sret
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: jmp t21_f_sret # TAILCALL
;
; X64-LABEL: t21_sret_to_sret:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: callq t21_f_sret
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: popq %rbx
-; X64-NEXT: retq
+; X64-NEXT: jmp t21_f_sret # TAILCALL
;
; X32-LABEL: t21_sret_to_sret:
; X32: # %bb.0:
-; X32-NEXT: pushq %rbx
-; X32-NEXT: movq %rdi, %rbx
-; X32-NEXT: callq t21_f_sret
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: popq %rbx
-; X32-NEXT: retq
+; X32-NEXT: jmp t21_f_sret # TAILCALL
tail call fastcc void @t21_f_sret(ptr noalias sret(%struct.foo) %agg.result) nounwind
ret void
}
@@ -640,34 +613,15 @@ define fastcc void @t21_sret_to_sret(ptr noalias sret(%struct.foo) %agg.result)
define fastcc void @t21_sret_to_sret_more_args(ptr noalias sret(%struct.foo) %agg.result, i32 %a, i32 %b) nounwind {
; X86-LABEL: t21_sret_to_sret_more_args:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: calll f_sret@PLT
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: jmp f_sret@PLT # TAILCALL
;
; X64-LABEL: t21_sret_to_sret_more_args:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: callq f_sret@PLT
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: popq %rbx
-; X64-NEXT: retq
+; X64-NEXT: jmp f_sret@PLT # TAILCALL
;
; X32-LABEL: t21_sret_to_sret_more_args:
; X32: # %bb.0:
-; X32-NEXT: pushq %rbx
-; X32-NEXT: movq %rdi, %rbx
-; X32-NEXT: callq f_sret@PLT
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: popq %rbx
-; X32-NEXT: retq
+; X32-NEXT: jmp f_sret@PLT # TAILCALL
tail call fastcc void @f_sret(ptr noalias sret(%struct.foo) %agg.result, i32 %a, i32 %b) nounwind
ret void
}
@@ -675,35 +629,18 @@ define fastcc void @t21_sret_to_sret_more_args(ptr noalias sret(%struct.foo) %ag
define fastcc void @t21_sret_to_sret_second_arg_sret(ptr noalias %agg.result, ptr noalias sret(%struct.foo) %ret) nounwind {
; X86-LABEL: t21_sret_to_sret_second_arg_sret:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: calll t21_f_sret
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: jmp t21_f_sret # TAILCALL
;
; X64-LABEL: t21_sret_to_sret_second_arg_sret:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rsi, %rbx
; X64-NEXT: movq %rsi, %rdi
-; X64-NEXT: callq t21_f_sret
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: popq %rbx
-; X64-NEXT: retq
+; X64-NEXT: jmp t21_f_sret # TAILCALL
;
; X32-LABEL: t21_sret_to_sret_second_arg_sret:
; X32: # %bb.0:
-; X32-NEXT: pushq %rbx
-; X32-NEXT: movq %rsi, %rbx
; X32-NEXT: movq %rsi, %rdi
-; X32-NEXT: callq t21_f_sret
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: popq %rbx
-; X32-NEXT: retq
+; X32-NEXT: jmp t21_f_sret # TAILCALL
tail call fastcc void @t21_f_sret(ptr noalias sret(%struct.foo) %ret) nounwind
ret void
}
@@ -725,27 +662,17 @@ define fastcc void @t21_sret_to_sret_more_args2(ptr noalias sret(%struct.foo) %a
;
; X64-LABEL: t21_sret_to_sret_more_args2:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
; X64-NEXT: movl %esi, %eax
-; X64-NEXT: movq %rdi, %rbx
; X64-NEXT: movl %edx, %esi
; X64-NEXT: movl %eax, %edx
-; X64-NEXT: callq f_sret@PLT
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: popq %rbx
-; X64-NEXT: retq
+; X64-NEXT: jmp f_sret@PLT # TAILCALL
;
; X32-LABEL: t21_sret_to_sret_more_args2:
; X32: # %bb.0:
-; X32-NEXT: pushq %rbx
; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movq %rdi, %rbx
; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %edx
-; X32-NEXT: callq f_sret@PLT
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: popq %rbx
-; X32-NEXT: retq
+; X32-NEXT: jmp f_sret@PLT # TAILCALL
tail call fastcc void @f_sret(ptr noalias sret(%struct.foo) %agg.result, i32 %b, i32 %a) nounwind
ret void
}