[llvm] [AArch64] Skip storing of stack arguments when lowering tail calls (PR #126735)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Fri May 16 15:21:01 PDT 2025
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/126735
From e7d3666de31381b4b8fe08f5e55bda24659d9aa5 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Tue, 11 Feb 2025 14:15:02 +0200
Subject: [PATCH] [AArch64] Skip storing of stack arguments when lowering tail
calls
When lowering a tail call, avoid emitting trivial loads and stores of stack
arguments that are passed through unchanged to the same offset on the stack.
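
As a minimal sketch of the pattern this targets (mirroring the new
tail-call-stack-args.ll test added below; the names @callee and @wrapper are
illustrative), a wrapper that forwards its stack-passed arguments unchanged
can branch straight to the callee:

    declare void @callee(i32, i32, i32, i32, i32, i32, i32, i32, i32, i1)

    ; Under AAPCS64 the ninth and tenth arguments (%i, %j) are passed on the
    ; stack; they arrive in the caller's fixed, immutable stack slots and are
    ; forwarded to the same offsets by the tail call.
    define void @wrapper(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f,
                         i32 %g, i32 %h, i32 %i, i1 %j) {
      tail call void @callee(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f,
                             i32 %g, i32 %h, i32 %i, i1 %j)
      ret void
    }

Previously the stack arguments were reloaded and re-stored to the same offsets
before the tail-call branch; with this change those copies are elided whenever
the source and destination stack slots are provably identical.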
---
.../Target/AArch64/AArch64ISelLowering.cpp | 47 +++++++-
.../AArch64/GISel/AArch64CallLowering.cpp | 54 ++++++++++
llvm/test/CodeGen/AArch64/darwinpcs-tail.ll | 4 +-
.../CodeGen/AArch64/scavenge-large-call.ll | 2 +-
.../sve-fixed-length-frame-offests-crash.ll | 102 +++++++++---------
.../CodeGen/AArch64/tail-call-stack-args.ll | 15 +++
6 files changed, 167 insertions(+), 57 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/tail-call-stack-args.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 293292d47dd48..fb49773f60221 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8968,6 +8968,46 @@ getSMToggleCondition(const SMECallAttrs &CallAttrs) {
llvm_unreachable("Unsupported attributes");
}
+/// Check whether a stack argument requires lowering in a tail call.
+static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
+ const CCValAssign &VA, SDValue Arg,
+ ISD::ArgFlagsTy Flags, int CallOffset) {
+ // FIXME: We should be able to handle this case, but it's not clear how to.
+ if (Flags.isZExt() || Flags.isSExt())
+ return true;
+
+ for (;;) {
+ // Look through nodes that don't alter the bits of the incoming value.
+ unsigned Op = Arg.getOpcode();
+ if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
+ Op == ISD::AssertZext || Op == ISD::AssertSext ||
+ Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
+ Arg = Arg.getOperand(0);
+ continue;
+ }
+ break;
+ }
+
+ // If the argument is a load from the same immutable stack slot, we can reuse
+ // it.
+ if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
+ if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ int FI = FINode->getIndex();
+ if (!MFI.isImmutableObjectIndex(FI))
+ return true;
+ if (CallOffset != MFI.getObjectOffset(FI))
+ return true;
+ uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
+ if (SizeInBits / 8 != MFI.getObjectSize(FI))
+ return true;
+ return false;
+ }
+ }
+
+ return true;
+}
+
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue
@@ -9391,10 +9431,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset + BEAlign;
- SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
- PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (IsTailCall) {
+ // When the caller and callee stack frames line up (FPDiff == 0) and the same
+ // stack argument is passed through unchanged, reuse the existing stack slot.
+ if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
+ continue;
+
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 9bef102e8abf1..b6d989fa85199 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelTypeUtils.h"
@@ -35,6 +36,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -44,6 +46,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -296,10 +299,61 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
+ /// Check whether a stack argument requires lowering in a tail call.
+ static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
+ const CCValAssign &VA,
+ Register ValVReg,
+ Register StoreAddr) {
+ // Find the instruction that defines the value.
+ auto *DefMI = MF.getRegInfo().getVRegDef(ValVReg);
+ assert(DefMI && "No defining instruction");
+ for (;;) {
+ // Look through instructions that don't alter the bits of the incoming value.
+ unsigned Op = DefMI->getOpcode();
+ if (Op == TargetOpcode::G_ZEXT || Op == TargetOpcode::G_ANYEXT ||
+ Op == TargetOpcode::G_TRUNC || Op == TargetOpcode::G_BITCAST ||
+ Op == TargetOpcode::G_ASSERT_ZEXT ||
+ Op == TargetOpcode::G_ASSERT_SEXT) {
+ DefMI = MF.getRegInfo().getVRegDef(DefMI->getOperand(1).getReg());
+ continue;
+ }
+ break;
+ }
+
+ auto *Load = dyn_cast<GLoad>(DefMI);
+ if (!Load)
+ return true;
+ Register LoadReg = Load->getPointerReg();
+ auto *LoadAddrDef = MF.getRegInfo().getVRegDef(LoadReg);
+ assert(LoadAddrDef && "No defining instruction");
+ if (LoadAddrDef->getOpcode() != TargetOpcode::G_FRAME_INDEX)
+ return true;
+ assert(LoadAddrDef && "No defining instruction");
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ int LoadFI = LoadAddrDef->getOperand(1).getIndex();
+
+ auto *StoreAddrDef = MF.getRegInfo().getVRegDef(StoreAddr);
+ assert(StoreAddrDef && "No defining instruction");
+ if (StoreAddrDef->getOpcode() != TargetOpcode::G_FRAME_INDEX)
+ return true;
+ int StoreFI = StoreAddrDef->getOperand(1).getIndex();
+
+ if (!MFI.isImmutableObjectIndex(LoadFI))
+ return true;
+ if (MFI.getObjectOffset(LoadFI) != MFI.getObjectOffset(StoreFI))
+ return true;
+ if (Load->getMemSize() != MFI.getObjectSize(StoreFI))
+ return true;
+
+ return false;
+ }
+
void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
const MachinePointerInfo &MPO,
const CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
+ if (!shouldLowerTailCallStackArg(MF, VA, ValVReg, Addr))
+ return;
auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, MemTy,
inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
diff --git a/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll b/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll
index 5d3c755d0d73d..61c25edded2ac 100644
--- a/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll
+++ b/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll
@@ -5,11 +5,11 @@
; CHECK-LABEL: __ZThn16_N1C3addEPKcz:
; CHECK: b __ZN1C3addEPKcz
+
; CHECK-LABEL: _tailTest:
; CHECK: b __ZN1C3addEPKcz
+
; CHECK-LABEL: __ZThn8_N1C1fEiiiiiiiiiz:
-; CHECK: ldr w9, [sp, #4]
-; CHECK: str w9, [sp, #4]
; CHECK: b __ZN1C1fEiiiiiiiiiz
%class.C = type { %class.A.base, [4 x i8], %class.B.base, [4 x i8] }
diff --git a/llvm/test/CodeGen/AArch64/scavenge-large-call.ll b/llvm/test/CodeGen/AArch64/scavenge-large-call.ll
index 0c9bdd098aa2a..0cbdd087a5b96 100644
--- a/llvm/test/CodeGen/AArch64/scavenge-large-call.ll
+++ b/llvm/test/CodeGen/AArch64/scavenge-large-call.ll
@@ -4,7 +4,7 @@
; CHECK: add {{x[0-9]+}}, sp,
define void @caller(ptr %0, i16 %1, i16 %2, i8 %3, double %4, i16 %5, i8 %6, ptr %7, double %8, i32 %9, ptr %10, double %11, double %12, [2 x i64] %13, [2 x i64] %14, [2 x i64] %15, double %16, double %17, [2 x i64] %18, [2 x i64] %19, i16 %20, i32 %21, double %22, i8 %23, [2 x i64] %24, [2 x i64] %25, [2 x i64] %26, i8 %27, i16 %28, i16 %29, i16 %30, i32 %31, [2 x i64] %32, [2 x i64] %33, [2 x i64] %34, [2 x i64] %35, [2 x i64] %36, i32 %37, i32 %38) {
- tail call void @callee(ptr %0, i16 %1, i16 %2, i8 %3, double 0.000000e+00, i16 %5, i8 %6, ptr %7, double 0.000000e+00, i32 %9, ptr %10, double 0.000000e+00, double 0.000000e+00, [2 x i64] %13, [2 x i64] %14, [2 x i64] %15, double 0.000000e+00, double 0.000000e+00, [2 x i64] %18, [2 x i64] %19, i16 %20, i32 %21, double 0.000000e+00, i8 %23, [2 x i64] %24, [2 x i64] %25, [2 x i64] zeroinitializer, i8 %27, i16 0, i16 0, i16 %28, i32 0, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] %35, [2 x i64] %36, i32 0, i32 0)
+ call void @callee(ptr %0, i16 %1, i16 %2, i8 %3, double 0.000000e+00, i16 %5, i8 %6, ptr %7, double 0.000000e+00, i32 %9, ptr %10, double 0.000000e+00, double 0.000000e+00, [2 x i64] %13, [2 x i64] %14, [2 x i64] %15, double 0.000000e+00, double 0.000000e+00, [2 x i64] %18, [2 x i64] %19, i16 %20, i32 %21, double 0.000000e+00, i8 %23, [2 x i64] %24, [2 x i64] %25, [2 x i64] zeroinitializer, i8 %27, i16 0, i16 0, i16 %28, i32 0, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] %35, [2 x i64] %36, i32 0, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
index 72686c3f418e7..c3fca4c18ee70 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
@@ -11,66 +11,64 @@ target triple = "aarch64-unknown-linux-gnu"
define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
; CHECK-LABEL: func1:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w20, -16
-; CHECK-NEXT: .cfi_offset w21, -24
-; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w29, -48
-; CHECK-NEXT: add x10, sp, #176
-; CHECK-NEXT: add x8, sp, #48
-; CHECK-NEXT: add x9, sp, #144
-; CHECK-NEXT: ldr z3, [x10]
+; CHECK-NEXT: sub sp, sp, #368
+; CHECK-NEXT: stp x29, x30, [sp, #336] // 16-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #352] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #336
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w28, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: add x8, x29, #32
+; CHECK-NEXT: add x9, x29, #72
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldr z0, [x8]
-; CHECK-NEXT: add x8, sp, #112
-; CHECK-NEXT: ldr z2, [x9]
+; CHECK-NEXT: add x8, x29, #256
+; CHECK-NEXT: ldr z3, [x9]
; CHECK-NEXT: ldr z1, [x8]
-; CHECK-NEXT: add x20, sp, #176
-; CHECK-NEXT: ldp x9, x8, [sp, #328]
-; CHECK-NEXT: ldr x15, [sp, #104]
-; CHECK-NEXT: ldp x11, x10, [sp, #312]
-; CHECK-NEXT: ldur q4, [sp, #88]
-; CHECK-NEXT: ldp x13, x12, [sp, #296]
-; CHECK-NEXT: ldr x19, [sp, #272]
-; CHECK-NEXT: ldp x18, x14, [sp, #280]
-; CHECK-NEXT: ldp x16, x17, [sp, #208]
-; CHECK-NEXT: ldp x21, x22, [sp, #352]
-; CHECK-NEXT: str z3, [x20]
-; CHECK-NEXT: add x20, sp, #144
-; CHECK-NEXT: str z2, [x20]
-; CHECK-NEXT: add x20, sp, #112
-; CHECK-NEXT: str z1, [x20]
-; CHECK-NEXT: add x20, sp, #48
-; CHECK-NEXT: str z0, [x20]
-; CHECK-NEXT: stp x21, x22, [sp, #352]
-; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: stp x19, x18, [sp, #272]
-; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: stp x16, x17, [sp, #208]
-; CHECK-NEXT: stur q4, [sp, #88]
-; CHECK-NEXT: str x15, [sp, #104]
-; CHECK-NEXT: stp x14, x13, [sp, #288]
-; CHECK-NEXT: stp x12, x11, [sp, #304]
-; CHECK-NEXT: stp x10, x9, [sp, #320]
-; CHECK-NEXT: str x8, [sp, #336]
-; CHECK-NEXT: ldr x29, [sp], #48 // 8-byte Folded Reload
-; CHECK-NEXT: b func2
+; CHECK-NEXT: add x8, x29, #288
+; CHECK-NEXT: add x9, x29, #168
+; CHECK-NEXT: ldr z2, [x8]
+; CHECK-NEXT: add x8, x29, #104
+; CHECK-NEXT: ldr z6, [x9]
+; CHECK-NEXT: ldr z4, [x8]
+; CHECK-NEXT: add x8, x29, #136
+; CHECK-NEXT: mov x12, #17 // =0x11
+; CHECK-NEXT: ldr z5, [x8]
+; CHECK-NEXT: ldp x10, x11, [x29, #336]
+; CHECK-NEXT: st1d { z6.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT: mov x12, #13 // =0xd
+; CHECK-NEXT: ldr x8, [x29, #200]
+; CHECK-NEXT: ldr x9, [x29, #320]
+; CHECK-NEXT: st1d { z5.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT: mov x12, #9 // =0x9
+; CHECK-NEXT: st1d { z4.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT: mov x12, #5 // =0x5
+; CHECK-NEXT: st1d { z3.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT: stp x10, x11, [sp, #304]
+; CHECK-NEXT: str x9, [sp, #288]
+; CHECK-NEXT: str z2, [sp, #8, mul vl]
+; CHECK-NEXT: str z1, [sp, #7, mul vl]
+; CHECK-NEXT: str x8, [sp, #168]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: bl func2
+; CHECK-NEXT: ldp x29, x30, [sp, #336] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #352] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #368
+; CHECK-NEXT: ret
ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr %v13, ptr %v14, ptr %v15, ptr %v16,
ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24,
ptr %v25, ptr %v26, ptr %v27, ptr %v28, ptr %v29, ptr %v30, ptr %v31, ptr %v32,
ptr %v33, ptr %v34, ptr %v35, ptr %v36, ptr %v37, ptr %v38, ptr %v39, ptr %v40,
ptr %v41, ptr %v42, ptr %v43, ptr %v44, ptr %v45, ptr %v46, ptr %v47, ptr %v48,
i64 %v49) #0 {
- tail call void @func2(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
- ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr undef, ptr %v14, ptr %v15, ptr %v16,
- ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24,
- ptr %v25, ptr %v26, ptr %v27, ptr %v28, ptr %v29, ptr %v30, ptr undef, ptr undef,
- ptr undef, ptr undef, ptr undef, ptr undef, ptr %v37, ptr %v38, ptr %v39, ptr %v40,
- ptr %v41, ptr %v42, ptr %v43, ptr %v44, ptr %v45, ptr undef, ptr %v47, ptr %v48,
- i64 undef)
+ call void @func2(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
+ ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr undef, ptr %v14, ptr %v15, ptr %v16,
+ ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24,
+ ptr %v25, ptr %v26, ptr %v27, ptr %v28, ptr %v29, ptr %v30, ptr undef, ptr undef,
+ ptr undef, ptr undef, ptr undef, ptr undef, ptr %v37, ptr %v38, ptr %v39, ptr %v40,
+ ptr %v41, ptr %v42, ptr %v43, ptr %v44, ptr %v45, ptr undef, ptr %v47, ptr %v48,
+ i64 undef)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/tail-call-stack-args.ll b/llvm/test/CodeGen/AArch64/tail-call-stack-args.ll
new file mode 100644
index 0000000000000..a1793fa9e151d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/tail-call-stack-args.ll
@@ -0,0 +1,15 @@
+; RUN: llc %s -mtriple=aarch64 -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64 -global-isel -o - | FileCheck %s
+
+; Tail calls whose stack arguments are already at the same offsets as in the
+; caller do not need to reload and re-store those arguments.
+
+declare void @func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j)
+
+; CHECK-LABEL: wrapper_func_i1:
+define void @wrapper_func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j) {
+ ; CHECK: // %bb.
+ ; CHECK-NEXT: b func_i1
+ tail call void @func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j)
+ ret void
+}
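
For reference, the darwinpcs-tail.ll update above shows the intended effect on
a thunk whose stack argument is forwarded unchanged (a rough sketch of the
before/after codegen):

    ; before this patch:
    ;   ldr w9, [sp, #4]
    ;   str w9, [sp, #4]
    ;   b   __ZN1C1fEiiiiiiiiiz
    ; after:
    ;   b   __ZN1C1fEiiiiiiiiiz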