[llvm-branch-commits] [llvm] release/22.x: [LoongArch] Fix musttail with indirect arguments by forwarding incoming pointers (#198965) (PR #199637)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue May 26 02:02:43 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-loongarch
Author: llvmbot
<details>
<summary>Changes</summary>
Backport 19e915fc5c91645ccc4050180e9daabec30358c4
Requested by: @<!-- -->heiher
---
Patch is 51.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/199637.diff
4 Files Affected:
- (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+183-44)
- (modified) llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h (+17)
- (added) llvm/test/CodeGen/LoongArch/musttail-call.ll (+20)
- (added) llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll (+907)
``````````diff
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 2cfe3b2bc1a99..7d3d333efe046 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -8172,9 +8172,22 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
"GHC calling convention requires the F and D extensions");
}
+ const Function &Func = MF.getFunction();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT GRLenVT = Subtarget.getGRLenVT();
unsigned GRLenInBytes = Subtarget.getGRLen() / 8;
+
+ // Check if this function has any musttail calls. If so, incoming indirect
+ // arg pointers must be saved in virtual registers so they survive across
+ // basic blocks (the SelectionDAG is cleared between BBs). Only do this
+ // when needed to avoid adding register pressure to non-musttail functions.
+ bool HasMusttail = llvm::any_of(Func, [](const BasicBlock &BB) {
+ return llvm::any_of(BB, [](const Instruction &I) {
+ if (const auto *CI = dyn_cast<CallInst>(&I))
+ return CI->isMustTailCall();
+ return false;
+ });
+ });
// Used with varargs to acumulate store chains.
std::vector<SDValue> OutChains;
@@ -8205,6 +8218,14 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
MachinePointerInfo()));
unsigned ArgIndex = Ins[InsIdx].OrigArgIndex;
+ if (HasMusttail) {
+ LoongArchMachineFunctionInfo *LAFI =
+ MF.getInfo<LoongArchMachineFunctionInfo>();
+ Register VReg =
+ MF.getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+ Chain = DAG.getCopyToReg(Chain, DL, VReg, ArgValue);
+ LAFI->setIncomingIndirectArg(ArgIndex, VReg);
+ }
unsigned ArgPartOffset = Ins[InsIdx].PartOffset;
assert(ArgPartOffset == 0);
while (i + 1 != e && Ins[InsIdx + 1].OrigArgIndex == ArgIndex) {
@@ -8335,6 +8356,27 @@ bool LoongArchTargetLowering::isEligibleForTailCallOptimization(
auto &Caller = MF.getFunction();
auto CallerCC = Caller.getCallingConv();
+ bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
+
+ // Byval parameters hand the function a pointer directly into the stack area
+ // we want to reuse during a tail call. Working around this *is* possible
+ // but less efficient and uglier in LowerCall. For musttail, there is no
+ // workaround today: a byval arg requires a local copy that becomes invalid
+ // after the tail call deallocates the caller's frame, so rejecting here
+ // (and triggering reportFatalInternalError in LowerCall) is safer than
+ // miscompiling.
+ for (auto &Arg : Outs)
+ if (Arg.Flags.isByVal())
+ return false;
+
+ // musttail bypasses the remaining checks: the checks either reject cases
+ // we handle specially (indirect args are forwarded via incoming pointers,
+ // stack-passed args reuse the matching incoming layout, sret is forwarded
+ // like any other pointer arg) or are optimizations not applicable to
+ // mandatory tail calls.
+ if (IsMustTail)
+ return true;
+
// Do not tail call opt if the stack is used to pass parameters.
if (CCInfo.getStackSize() != 0)
return false;
@@ -8351,11 +8393,6 @@ bool LoongArchTargetLowering::isEligibleForTailCallOptimization(
if (IsCallerStructRet || IsCalleeStructRet)
return false;
- // Do not tail call opt if either the callee or caller has a byval argument.
- for (auto &Arg : Outs)
- if (Arg.Flags.isByVal())
- return false;
-
// The callee has to preserve all registers the caller needs to preserve.
const LoongArchRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
@@ -8488,47 +8525,149 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Promote the value if needed.
// For now, only handle fully promoted and indirect arguments.
if (VA.getLocInfo() == CCValAssign::Indirect) {
- // Store the argument in a stack slot and pass its address.
- Align StackAlign =
- std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG),
- getPrefTypeAlign(ArgValue.getValueType(), DAG));
- TypeSize StoredSize = ArgValue.getValueType().getStoreSize();
- // If the original argument was split and passed by reference, we need to
- // store the required parts of it here (and pass just one address).
- unsigned ArgIndex = Outs[OutIdx].OrigArgIndex;
- unsigned ArgPartOffset = Outs[OutIdx].PartOffset;
- assert(ArgPartOffset == 0);
- // Calculate the total size to store. We don't have access to what we're
- // actually storing other than performing the loop and collecting the
- // info.
- SmallVector<std::pair<SDValue, SDValue>> Parts;
- while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) {
- SDValue PartValue = OutVals[OutIdx + 1];
- unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset;
- SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
- EVT PartVT = PartValue.getValueType();
+ // For musttail calls, reuse incoming indirect pointers instead of
+ // creating new stack temporaries. The incoming pointers point to the
+ // caller's caller's frame, which remains valid after a tail call.
+ if (IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
+ LoongArchMachineFunctionInfo *LAFI =
+ MF.getInfo<LoongArchMachineFunctionInfo>();
+ unsigned CallArgIdx = Outs[OutIdx].OrigArgIndex;
+
+ // Resolve which formal parameter is being passed at this call
+ // position.
+ //
+ // FIXME: Ins[].OrigArgIndex is Argument::getArgNo() (unfiltered),
+ // but Outs[].OrigArgIndex is an index into a filtered arg list
+ // (empty types removed, via CallLoweringInfo in the target-
+ // independent layer). IncomingIndirectArgs is keyed by the
+ // caller's unfiltered Argument::getArgNo(), so we have to walk
+ // the caller's formals (same filter) to translate the index.
+ // This target-independent asymmetry should be normalized so
+ // backends do not need to re-derive the mapping.
+ //
+ // Steps:
+ // 1. Find the call operand at filtered position CallArgIdx.
+ // 2. If it is an Argument, use getArgNo() directly (same filter
+ // for caller formals and call operands).
+ // 3. Otherwise (computed value), walk the caller's formals and
+ // skip empty types to map the filtered index to getArgNo().
+ const Argument *FormalArg = nullptr;
+ unsigned FilteredIdx = 0;
+ for (const auto &CallArg : CLI.CB->args()) {
+ if (CallArg->getType()->isEmptyTy())
+ continue;
+ if (FilteredIdx == CallArgIdx) {
+ FormalArg = dyn_cast<Argument>(CallArg);
+ break;
+ }
+ ++FilteredIdx;
+ }
- StoredSize += PartVT.getStoreSize();
- StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG));
- Parts.push_back(std::make_pair(PartValue, Offset));
- ++i;
- ++OutIdx;
- }
- SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign);
- int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
- MemOpChains.push_back(
- DAG.getStore(Chain, DL, ArgValue, SpillSlot,
- MachinePointerInfo::getFixedStack(MF, FI)));
- for (const auto &Part : Parts) {
- SDValue PartValue = Part.first;
- SDValue PartOffset = Part.second;
- SDValue Address =
- DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset);
+ // For forwarded args, getArgNo() gives the unfiltered index directly.
+ // For computed args, walk the caller's formals to resolve it.
+ unsigned FormalArgIdx = CallArgIdx;
+ if (FormalArg) {
+ FormalArgIdx = FormalArg->getArgNo();
+ } else {
+ FilteredIdx = 0;
+ for (const auto &Arg : MF.getFunction().args()) {
+ if (Arg.getType()->isEmptyTy())
+ continue;
+ if (FilteredIdx == CallArgIdx) {
+ FormalArgIdx = Arg.getArgNo();
+ break;
+ }
+ ++FilteredIdx;
+ }
+ }
+
+ Register VReg = LAFI->getIncomingIndirectArg(FormalArgIdx);
+ SDValue CopyOp = DAG.getCopyFromReg(Chain, DL, VReg, PtrVT);
+ // Thread the CopyFromReg output chain through MemOpChains so the
+ // TokenFactor below sequences the copy with any stores we emit
+ // for this argument.
+ MemOpChains.push_back(CopyOp.getValue(1));
+ SDValue IncomingPtr = CopyOp;
+
+ if (!FormalArg) {
+ // Computed value: store into the incoming indirect pointer for the
+ // same-position formal parameter (musttail guarantees matching
+ // prototypes, so types match). The pointer survives the tail call
+ // since it points to the caller's caller's frame.
+ //
+ // The data-flow edge through IncomingPtr already prevents the
+ // store from being scheduled before the CopyFromReg. Threading
+ // CopyOp.getValue(1) (the copy's output chain) into the store
+ // makes that ordering explicit on the chain edge as well, which
+ // is the convention for memory ops chaining off their producers.
+ MemOpChains.push_back(
+ DAG.getStore(CopyOp.getValue(1), DL, ArgValue, IncomingPtr,
+ MachinePointerInfo::getUnknownStack(MF)));
+ // Store any split parts at their respective offsets.
+ unsigned ArgPartOffset = Outs[OutIdx].PartOffset;
+ while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) {
+ SDValue PartValue = OutVals[OutIdx + 1];
+ unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset;
+ SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
+ SDValue Addr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, IncomingPtr, Offset);
+ MemOpChains.push_back(
+ DAG.getStore(CopyOp.getValue(1), DL, PartValue, Addr,
+ MachinePointerInfo::getUnknownStack(MF)));
+ ++i;
+ ++OutIdx;
+ }
+ }
+ ArgValue = IncomingPtr;
+
+ // Skip any remaining split parts (for forwarded args, they are
+ // covered by the forwarded pointer).
+ while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) {
+ ++i;
+ ++OutIdx;
+ }
+ } else {
+ // Store the argument in a stack slot and pass its address.
+ Align StackAlign =
+ std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG),
+ getPrefTypeAlign(ArgValue.getValueType(), DAG));
+ TypeSize StoredSize = ArgValue.getValueType().getStoreSize();
+ // If the original argument was split and passed by reference, we need
+ // to store the required parts of it here (and pass just one address).
+ unsigned ArgIndex = Outs[OutIdx].OrigArgIndex;
+ unsigned ArgPartOffset = Outs[OutIdx].PartOffset;
+ assert(ArgPartOffset == 0);
+ // Calculate the total size to store. We don't have access to what we're
+ // actually storing other than performing the loop and collecting the
+ // info.
+ SmallVector<std::pair<SDValue, SDValue>> Parts;
+ while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) {
+ SDValue PartValue = OutVals[OutIdx + 1];
+ unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset;
+ SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
+ EVT PartVT = PartValue.getValueType();
+ StoredSize += PartVT.getStoreSize();
+ StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG));
+ Parts.push_back(std::make_pair(PartValue, Offset));
+ ++i;
+ ++OutIdx;
+ }
+ SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign);
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
MemOpChains.push_back(
- DAG.getStore(Chain, DL, PartValue, Address,
+ DAG.getStore(Chain, DL, ArgValue, SpillSlot,
MachinePointerInfo::getFixedStack(MF, FI)));
+ for (const auto &Part : Parts) {
+ SDValue PartValue = Part.first;
+ SDValue PartOffset = Part.second;
+ SDValue Address =
+ DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, PartValue, Address,
+ MachinePointerInfo::getFixedStack(MF, FI)));
+ }
+ ArgValue = SpillSlot;
}
- ArgValue = SpillSlot;
} else {
ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL);
}
@@ -8542,8 +8681,8 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
- assert(!IsTailCall && "Tail call not allowed if stack is used "
- "for passing parameters");
+ assert((!IsTailCall || (CLI.CB && CLI.CB->isMustTailCall())) &&
+ "Tail call not allowed if stack is used for passing parameters");
// Work out the address of the stack slot.
if (!StackPtr.getNode())
diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
index 904985c189dba..7bf7171198e8a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
@@ -14,6 +14,7 @@
#define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHMACHINEFUNCTIONINFO_H
#include "LoongArchSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -32,6 +33,13 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo {
/// Size of stack frame to save callee saved registers
unsigned CalleeSavedStackSize = 0;
+ /// Incoming indirect argument pointers saved as virtual registers, keyed by
+ /// formal parameter index. Used for musttail forwarding of indirect args.
+ /// Virtual registers (not SDValues) are used because the SelectionDAG is
+ /// cleared between basic blocks, and musttail calls may be in non-entry
+ /// blocks.
+ DenseMap<unsigned, Register> IncomingIndirectArgs;
+
/// FrameIndex of the spill slot when there is no scavenged register in
/// insertIndirectBranch.
int BranchRelaxationSpillFrameIndex = -1;
@@ -63,6 +71,15 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo {
unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
+ void setIncomingIndirectArg(unsigned ArgIndex, Register Reg) {
+ IncomingIndirectArgs[ArgIndex] = Reg;
+ }
+ Register getIncomingIndirectArg(unsigned ArgIndex) const {
+ auto It = IncomingIndirectArgs.find(ArgIndex);
+ assert(It != IncomingIndirectArgs.end() && "No incoming indirect arg");
+ return It->second;
+ }
+
int getBranchRelaxationSpillFrameIndex() {
return BranchRelaxationSpillFrameIndex;
}
diff --git a/llvm/test/CodeGen/LoongArch/musttail-call.ll b/llvm/test/CodeGen/LoongArch/musttail-call.ll
new file mode 100644
index 0000000000000..0fe77ed802b7a
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/musttail-call.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32
+; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64
+
+%struct.A = type { i32 }
+
+declare void @callee_musttail(ptr sret(%struct.A) %a)
+define void @caller_musttail(ptr sret(%struct.A) %a) {
+; LA32-LABEL: caller_musttail:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: b callee_musttail
+;
+; LA64-LABEL: caller_musttail:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcaddu18i $t8, %call36(callee_musttail)
+; LA64-NEXT: jr $t8
+entry:
+ musttail call void @callee_musttail(ptr sret(%struct.A) %a)
+ ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll b/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll
new file mode 100644
index 0000000000000..d088d6065aa07
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll
@@ -0,0 +1,907 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32
+; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64
+
+; Test that musttail with indirect args (fp128 on LA32) forwards the incoming
+; pointer instead of creating a new stack temporary. Without this fix, the
+; pointer would dangle after the tail call deallocates the caller's frame.
+
+declare i32 @callee_musttail_indirect(fp128 %a)
+
+; fp128 is indirect on LA32 (too large for registers), direct on LA64.
+; On LA32, musttail must forward the incoming indirect pointer (a0) directly.
+define i32 @caller_musttail_indirect(fp128 %a) nounwind {
+; LA32-LABEL: caller_musttail_indirect:
+; LA32: # %bb.0:
+; LA32-NEXT: b callee_musttail_indirect
+;
+; LA64-LABEL: caller_musttail_indirect:
+; LA64: # %bb.0:
+; LA64-NEXT: pcaddu18i $t8, %call36(callee_musttail_indirect)
+; LA64-NEXT: jr $t8
+ %call = musttail call i32 @callee_musttail_indirect(fp128 %a)
+ ret i32 %call
+}
+
+; Verify that on LA32 a non-musttail tail call with indirect args is NOT emitted as
+; a tail call (this is the PR #184972 fix - indirect args are unsafe for regular tail calls).
+define void @caller_no_musttail_indirect() nounwind {
+; LA32-LABEL: caller_no_musttail_indirect:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -32
+; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: lu12i.w $a0, 262128
+; LA32-NEXT: st.w $a0, $sp, 12
+; LA32-NEXT: st.w $zero, $sp, 8
+; LA32-NEXT: st.w $zero, $sp, 4
+; LA32-NEXT: addi.w $a0, $sp, 0
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bl callee_musttail_indirect
+; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 32
+; LA32-NEXT: ret
+;
+; LA64-LABEL: caller_no_musttail_indirect:
+; LA64: # %bb.0:
+; LA64-NEXT: ori $a0, $zero, 0
+; LA64-NEXT: lu32i.d $a0, -65536
+; LA64-NEXT: lu52i.d $a1, $a0, 1023
+; LA64-NEXT: move $a0, $zero
+; LA64-NEXT: pcaddu18i $t8, %call36(callee_musttail_indirect)
+; LA64-NEXT: jr $t8
+ %call = tail call i32 @callee_musttail_indirect(fp128 0xL00000000000000003FFF000000000000)
+ ret void
+}
+
+; Verify that on LA32 a non-musttail tail call forwarding an indirect arg from
+; the caller's own parameters is also NOT emitted as a tail call (the arg lives
+; on the caller's frame, which would be deallocated).
+define i32 @caller_no_musttail_forward_indirect(fp128 %a) nounwind {
+; LA32-LABEL: caller_no_musttail_forward_indirect:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -32
+; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: ld.w $a1, $a0, 0
+; LA32-NEXT: ld.w $a2, $a0, 4
+; LA32-NEXT: ld.w $a3, $a0, 8
+; LA32-NEXT: ld.w $a0, $a0, 12
+; LA32-NEXT: st.w $a0, $sp, 12
+; LA32-NEXT: st.w $a3, $sp, 8
+; LA32-NEXT: st.w $a2, $sp, 4
+; LA32-NEXT: addi.w $a0, $sp, 0
+; LA32-NEXT: st.w $a1, $sp, 0
+; LA32-NEXT: bl callee_musttail_indirect
+; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 32
+; LA32-NEXT: ret
+;
+; LA64-LABEL: caller_no_musttail_forward_indirect:
+; LA64: # %bb.0:
+; LA64-NEXT: pcaddu18i $t8, %call36(callee_musttail_indirect)
+; LA64-NEXT: jr $t8
+ %call = tail call i32 @callee_musttail_indirect(fp128 %a)
+ ret i32 %call
+}
+
+; Test musttail with two indirect fp128 args on LA32. Both pointers must be
+; forwarded. Exercises the D...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/199637
More information about the llvm-branch-commits
mailing list