[llvm] [AArch64] Skip storing of stack arguments when lowering tail calls (PR #126735)

Guy David via llvm-commits llvm-commits at lists.llvm.org
Sun Feb 16 15:59:38 PST 2025


https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/126735

>From c2346860a70cfad9686ad0ac6af108734675cda7 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Tue, 11 Feb 2025 14:15:02 +0200
Subject: [PATCH 1/2] [AArch64] Skip storing of stack arguments when lowering
 tail calls

When possible, do not emit trivial load and stores to the same offset on
the stack.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  47 +++++++-
 llvm/test/CodeGen/AArch64/darwinpcs-tail.ll   |   4 +-
 .../CodeGen/AArch64/scavenge-large-call.ll    |   2 +-
 .../sve-fixed-length-frame-offests-crash.ll   | 103 +++++++++---------
 .../CodeGen/AArch64/tail-call-stack-args.ll   |  59 ++++++++++
 5 files changed, 158 insertions(+), 57 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/tail-call-stack-args.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4263be1098899..e03cdc1596957 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8900,6 +8900,46 @@ static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
   llvm_unreachable("Unsupported attributes");
 }
 
+/// Check whether a stack argument requires lowering in a tail call.
+static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
+                                        const CCValAssign &VA, SDValue Arg,
+                                        ISD::ArgFlagsTy Flags, int CallOffset) {
+  // FIXME: We should be able to handle this case, but it's not clear how to.
+  if (Flags.isZExt() || Flags.isSExt())
+    return true;
+
+  for (;;) {
+    // Look through nodes that don't alter the bits of the incoming value.
+    unsigned Op = Arg.getOpcode();
+    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
+        Op == ISD::AssertZext || Op == ISD::AssertSext ||
+        Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
+      Arg = Arg.getOperand(0);
+      continue;
+    }
+    break;
+  }
+
+  // If the argument is a load from the same immutable stack slot, we can reuse
+  // it.
+  if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
+    if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
+      const MachineFrameInfo &MFI = MF.getFrameInfo();
+      int FI = FINode->getIndex();
+      if (!MFI.isImmutableObjectIndex(FI))
+        return true;
+      if (CallOffset != MFI.getObjectOffset(FI))
+        return true;
+      uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
+      if (SizeInBits / 8 != MFI.getObjectSize(FI))
+        return true;
+      return false;
+    }
+  }
+
+  return true;
+}
+
 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
 /// and add input and output parameter nodes.
 SDValue
@@ -9328,10 +9368,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       }
       unsigned LocMemOffset = VA.getLocMemOffset();
       int32_t Offset = LocMemOffset + BEAlign;
-      SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
-      PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
 
       if (IsTailCall) {
+        // When the frame pointer is perfectly aligned for the tail call and the
+        // same stack argument is passed down intact, we can reuse it.
+        if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
+          continue;
+
         Offset = Offset + FPDiff;
         int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
 
diff --git a/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll b/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll
index 5d3c755d0d73d..61c25edded2ac 100644
--- a/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll
+++ b/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll
@@ -5,11 +5,11 @@
 
 ; CHECK-LABEL: __ZThn16_N1C3addEPKcz:
 ; CHECK:       b __ZN1C3addEPKcz
+
 ; CHECK-LABEL: _tailTest:
 ; CHECK:       b __ZN1C3addEPKcz
+
 ; CHECK-LABEL: __ZThn8_N1C1fEiiiiiiiiiz:
-; CHECK:       ldr     w9, [sp, #4]
-; CHECK:       str     w9, [sp, #4]
 ; CHECK:       b __ZN1C1fEiiiiiiiiiz
 
 %class.C = type { %class.A.base, [4 x i8], %class.B.base, [4 x i8] }
diff --git a/llvm/test/CodeGen/AArch64/scavenge-large-call.ll b/llvm/test/CodeGen/AArch64/scavenge-large-call.ll
index 0c9bdd098aa2a..0cbdd087a5b96 100644
--- a/llvm/test/CodeGen/AArch64/scavenge-large-call.ll
+++ b/llvm/test/CodeGen/AArch64/scavenge-large-call.ll
@@ -4,7 +4,7 @@
 ; CHECK: add {{x[0-9]+}}, sp,
 
 define void @caller(ptr %0, i16 %1, i16 %2, i8 %3, double %4, i16 %5, i8 %6, ptr %7, double %8, i32 %9, ptr %10, double %11, double %12, [2 x i64] %13, [2 x i64] %14, [2 x i64] %15, double %16, double %17, [2 x i64] %18, [2 x i64] %19, i16 %20, i32 %21, double %22, i8 %23, [2 x i64] %24, [2 x i64] %25, [2 x i64] %26, i8 %27, i16 %28, i16 %29, i16 %30, i32 %31, [2 x i64] %32, [2 x i64] %33, [2 x i64] %34, [2 x i64] %35, [2 x i64] %36, i32 %37, i32 %38) {
-  tail call void @callee(ptr %0, i16 %1, i16 %2, i8 %3, double 0.000000e+00, i16 %5, i8 %6, ptr %7, double 0.000000e+00, i32 %9, ptr %10, double 0.000000e+00, double 0.000000e+00, [2 x i64] %13, [2 x i64] %14, [2 x i64] %15, double 0.000000e+00, double 0.000000e+00, [2 x i64] %18, [2 x i64] %19, i16 %20, i32 %21, double 0.000000e+00, i8 %23, [2 x i64] %24, [2 x i64] %25, [2 x i64] zeroinitializer, i8 %27, i16 0, i16 0, i16 %28, i32 0, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] %35, [2 x i64] %36, i32 0, i32 0)
+  call void @callee(ptr %0, i16 %1, i16 %2, i8 %3, double 0.000000e+00, i16 %5, i8 %6, ptr %7, double 0.000000e+00, i32 %9, ptr %10, double 0.000000e+00, double 0.000000e+00, [2 x i64] %13, [2 x i64] %14, [2 x i64] %15, double 0.000000e+00, double 0.000000e+00, [2 x i64] %18, [2 x i64] %19, i16 %20, i32 %21, double 0.000000e+00, i8 %23, [2 x i64] %24, [2 x i64] %25, [2 x i64] zeroinitializer, i8 %27, i16 0, i16 0, i16 %28, i32 0, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] zeroinitializer, [2 x i64] %35, [2 x i64] %36, i32 0, i32 0)
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
index 1bd688d23050b..c4d886336cf68 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
@@ -11,67 +11,66 @@ target triple = "aarch64-unknown-linux-gnu"
 define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
 ; CHECK-LABEL: func1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w29, -48
+; CHECK-NEXT:    sub sp, sp, #368
+; CHECK-NEXT:    stp x29, x30, [sp, #336] // 16-byte Folded Spill
+; CHECK-NEXT:    str x28, [sp, #352] // 8-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #336
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w28, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    add x10, sp, #176
-; CHECK-NEXT:    add x8, sp, #48
-; CHECK-NEXT:    add x9, sp, #144
-; CHECK-NEXT:    add x20, sp, #176
-; CHECK-NEXT:    ldr x15, [sp, #104]
-; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x10]
+; CHECK-NEXT:    add x8, x29, #32
+; CHECK-NEXT:    add x9, x29, #136
+; CHECK-NEXT:    mov x12, #32 // =0x20
+; CHECK-NEXT:    ldp x10, x11, [x29, #336]
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    add x8, sp, #112
-; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x9]
+; CHECK-NEXT:    add x8, x29, #72
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x9]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x8]
-; CHECK-NEXT:    ldur q4, [sp, #88]
-; CHECK-NEXT:    ldp x9, x8, [sp, #328]
-; CHECK-NEXT:    ldr x19, [sp, #272]
-; CHECK-NEXT:    ldp x11, x10, [sp, #312]
-; CHECK-NEXT:    ldp x13, x12, [sp, #296]
-; CHECK-NEXT:    ldp x18, x14, [sp, #280]
-; CHECK-NEXT:    ldp x16, x17, [sp, #208]
-; CHECK-NEXT:    ldp x21, x22, [sp, #352]
-; CHECK-NEXT:    st1d { z3.d }, p0, [x20]
-; CHECK-NEXT:    add x20, sp, #144
-; CHECK-NEXT:    st1d { z2.d }, p0, [x20]
-; CHECK-NEXT:    add x20, sp, #112
-; CHECK-NEXT:    st1d { z1.d }, p0, [x20]
-; CHECK-NEXT:    add x20, sp, #48
-; CHECK-NEXT:    st1d { z0.d }, p0, [x20]
-; CHECK-NEXT:    stp x21, x22, [sp, #352]
-; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    stp x19, x18, [sp, #272]
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    stp x16, x17, [sp, #208]
-; CHECK-NEXT:    stur q4, [sp, #88]
-; CHECK-NEXT:    str x15, [sp, #104]
-; CHECK-NEXT:    stp x14, x13, [sp, #288]
-; CHECK-NEXT:    stp x12, x11, [sp, #304]
-; CHECK-NEXT:    stp x10, x9, [sp, #320]
-; CHECK-NEXT:    str x8, [sp, #336]
-; CHECK-NEXT:    ldr x29, [sp], #48 // 8-byte Folded Reload
-; CHECK-NEXT:    b func2
+; CHECK-NEXT:    add x8, x29, #104
+; CHECK-NEXT:    add x9, x29, #288
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x8]
+; CHECK-NEXT:    add x8, x29, #168
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x9]
+; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x8]
+; CHECK-NEXT:    add x8, x29, #256
+; CHECK-NEXT:    ldr x9, [x29, #320]
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x8]
+; CHECK-NEXT:    ldr x8, [x29, #200]
+; CHECK-NEXT:    st1d { z6.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT:    mov x12, #28 // =0x1c
+; CHECK-NEXT:    st1d { z5.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT:    mov x12, #17 // =0x11
+; CHECK-NEXT:    st1d { z4.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT:    mov x12, #13 // =0xd
+; CHECK-NEXT:    st1d { z3.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT:    mov x12, #9 // =0x9
+; CHECK-NEXT:    st1d { z2.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT:    mov x12, #5 // =0x5
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, x12, lsl #3]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    stp x10, x11, [sp, #304]
+; CHECK-NEXT:    str x9, [sp, #288]
+; CHECK-NEXT:    str x8, [sp, #168]
+; CHECK-NEXT:    bl func2
+; CHECK-NEXT:    ldp x29, x30, [sp, #336] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x28, [sp, #352] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #368
+; CHECK-NEXT:    ret
                              ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr %v13, ptr %v14,  ptr %v15, ptr %v16,
                              ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24,
                              ptr %v25, ptr %v26, ptr %v27, ptr %v28, ptr %v29, ptr %v30, ptr %v31, ptr %v32,
                              ptr %v33, ptr %v34, ptr %v35, ptr %v36, ptr %v37, ptr %v38, ptr %v39, ptr %v40,
                              ptr %v41, ptr %v42, ptr %v43, ptr %v44, ptr %v45, ptr %v46, ptr %v47, ptr %v48,
                              i64 %v49) #0 {
-  tail call void @func2(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
-                        ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr undef, ptr %v14, ptr %v15, ptr %v16,
-                        ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24,
-                        ptr %v25, ptr %v26, ptr %v27, ptr %v28, ptr %v29, ptr %v30, ptr undef, ptr undef,
-                        ptr undef, ptr undef, ptr undef, ptr undef, ptr %v37, ptr %v38, ptr %v39, ptr %v40,
-                        ptr %v41, ptr %v42, ptr %v43, ptr %v44, ptr %v45, ptr undef, ptr %v47, ptr %v48,
-                        i64 undef)
+  call void @func2(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
+                   ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr undef, ptr %v14, ptr %v15, ptr %v16,
+                   ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24,
+                   ptr %v25, ptr %v26, ptr %v27, ptr %v28, ptr %v29, ptr %v30, ptr undef, ptr undef,
+                   ptr undef, ptr undef, ptr undef, ptr undef, ptr %v37, ptr %v38, ptr %v39, ptr %v40,
+                   ptr %v41, ptr %v42, ptr %v43, ptr %v44, ptr %v45, ptr undef, ptr %v47, ptr %v48,
+                   i64 undef)
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AArch64/tail-call-stack-args.ll b/llvm/test/CodeGen/AArch64/tail-call-stack-args.ll
new file mode 100644
index 0000000000000..e438fdb9d753e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/tail-call-stack-args.ll
@@ -0,0 +1,59 @@
+; RUN: llc %s -o - | FileCheck %s
+
+; Tail calls which have stack arguments in the same offsets as the caller do not
+; need to load and store the arguments from the stack.
+
+target triple = "aarch64"
+
+declare void @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j)
+
+; CHECK-LABEL: wrapper_func:
+define void @wrapper_func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+  ; CHECK: // %bb.
+  ; CHECK-NEXT: b func
+  tail call void @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j)
+  ret void
+}
+
+; CHECK-LABEL: wrapper_func_zero_arg:
+define void @wrapper_func_zero_arg(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+  ; CHECK: // %bb.
+  ; CHECK-NEXT: str wzr, [sp, #8]
+  ; CHECK-NEXT: b func
+  tail call void @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 0)
+  ret void
+}
+
+; CHECK-LABEL: wrapper_func_overridden_arg:
+define void @wrapper_func_overridden_arg(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+  ; CHECK: // %bb.
+  ; CHECK-NEXT: ldr w8, [sp]
+  ; CHECK-NEXT: str wzr, [sp]
+  ; CHECK-NEXT: str w8, [sp, #8]
+  ; CHECK-NEXT: b func
+  tail call void @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 0, i32 %i)
+  ret void
+}
+
+declare void @func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j)
+
+; CHECK-LABEL: wrapper_func_i1:
+define void @wrapper_func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j) {
+  ; CHECK: // %bb.
+  ; CHECK-NEXT: b func_i1
+  tail call void @func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j)
+  ret void
+}
+
+declare void @func_signext_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 signext %j)
+
+; FIXME: Support zero/sign-extended stack arguments.
+; CHECK-LABEL: wrapper_func_signext_i1:
+define void @wrapper_func_signext_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 signext %j) {
+  ; CHECK: // %bb.
+  ; CHECK-NEXT: ldrsb w8, [sp, #8]
+  ; CHECK-NEXT: strb w8, [sp, #8]
+  ; CHECK-NEXT: b func_signext_i1
+  tail call void @func_signext_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 signext %j)
+  ret void
+}

>From 2e4596486093aa6475904bde3480ae204a3058f4 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Mon, 17 Feb 2025 01:20:30 +0200
Subject: [PATCH 2/2] [ARM] Skip storing of stack arguments when lowering tail
 calls

---
 llvm/lib/Target/ARM/ARMISelLowering.cpp       | 46 ++++++++++++++
 llvm/test/CodeGen/ARM/debug-frame.ll          |  4 +-
 llvm/test/CodeGen/ARM/ehabi.ll                |  6 +-
 llvm/test/CodeGen/ARM/fp16-vector-argument.ll |  6 +-
 llvm/test/CodeGen/ARM/tail-call-stack-args.ll | 63 +++++++++++++++++++
 llvm/test/CodeGen/Thumb2/mve-be.ll            |  6 --
 6 files changed, 115 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/tail-call-stack-args.ll

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 2bac1d0086041..7ac739fed9028 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2422,6 +2422,45 @@ static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
 }
 
+/// Check whether a stack argument requires lowering in a tail call.
+static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
+                                        const CCValAssign &VA, SDValue Arg,
+                                        ISD::ArgFlagsTy Flags, int CallOffset) {
+  // FIXME: We should be able to handle this case, but it's not clear how to.
+  if (Flags.isZExt() || Flags.isSExt())
+    return true;
+
+  for (;;) {
+    // Look through nodes that don't alter the bits of the incoming value.
+    unsigned Op = Arg.getOpcode();
+    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
+        Op == ISD::AssertZext || Op == ISD::AssertSext) {
+      Arg = Arg.getOperand(0);
+      continue;
+    }
+    break;
+  }
+
+  // If the argument is a load from the same immutable stack slot, we can reuse
+  // it.
+  if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
+    if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
+      const MachineFrameInfo &MFI = MF.getFrameInfo();
+      int FI = FINode->getIndex();
+      if (!MFI.isImmutableObjectIndex(FI))
+        return true;
+      if (CallOffset != MFI.getObjectOffset(FI))
+        return true;
+      uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
+      if (SizeInBits / 8 != MFI.getObjectSize(FI))
+        return true;
+      return false;
+    }
+  }
+
+  return true;
+}
+
 /// LowerCall - Lowering a call into a callseq_start <-
 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
 /// nodes.
@@ -2783,6 +2822,13 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       }
     } else {
       assert(VA.isMemLoc());
+
+      // When the frame pointer is perfectly aligned for the tail call and the
+      // same stack argument is passed down intact, we can reuse it.
+      if (!SPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags,
+                                                  VA.getLocMemOffset()))
+        continue;
+
       SDValue DstAddr;
       MachinePointerInfo DstInfo;
       std::tie(DstAddr, DstInfo) =
diff --git a/llvm/test/CodeGen/ARM/debug-frame.ll b/llvm/test/CodeGen/ARM/debug-frame.ll
index 72e7cfcab487a..474bcae54921d 100644
--- a/llvm/test/CodeGen/ARM/debug-frame.ll
+++ b/llvm/test/CodeGen/ARM/debug-frame.ll
@@ -175,7 +175,7 @@ declare void @_ZSt9terminatev()
 ; CHECK-FP:   .cfi_offset r4, -36
 ; CHECK-FP:   add    r11, sp, #28
 ; CHECK-FP:   .cfi_def_cfa r11, 8
-; CHECK-FP:   sub    sp, sp, #44
+; CHECK-FP:   sub    sp, sp, #36
 ; CHECK-FP:   .cfi_endproc
 
 ; CHECK-FP-ELIM-LABEL: _Z4testiiiiiddddd:
@@ -240,7 +240,7 @@ declare void @_ZSt9terminatev()
 ; CHECK-THUMB-FP:   .cfi_offset r4, -20
 ; CHECK-THUMB-FP:   add    r7, sp, #12
 ; CHECK-THUMB-FP:   .cfi_def_cfa r7, 8
-; CHECK-THUMB-FP:   sub    sp, #60
+; CHECK-THUMB-FP:   sub    sp, #52
 ; CHECK-THUMB-FP:   .cfi_endproc
 
 ; CHECK-THUMB-FP-ELIM-LABEL: _Z4testiiiiiddddd:
diff --git a/llvm/test/CodeGen/ARM/ehabi.ll b/llvm/test/CodeGen/ARM/ehabi.ll
index d1a4e9a6bccad..6626bd34379ca 100644
--- a/llvm/test/CodeGen/ARM/ehabi.ll
+++ b/llvm/test/CodeGen/ARM/ehabi.ll
@@ -167,8 +167,8 @@ declare void @_ZSt9terminatev()
 ; CHECK-FP:   push   {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-FP:   .setfp r11, sp, #28
 ; CHECK-FP:   add    r11, sp, #28
-; CHECK-FP:   .pad   #44
-; CHECK-FP:   sub    sp, sp, #44
+; CHECK-FP:   .pad   #36
+; CHECK-FP:   sub    sp, sp, #36
 ; CHECK-FP:   .personality __gxx_personality_v0
 ; CHECK-FP:   .handlerdata
 ; CHECK-FP:   .fnend
@@ -226,7 +226,7 @@ declare void @_ZSt9terminatev()
 ; DWARF-FP:    .cfi_offset r4, -36
 ; DWARF-FP:    add r11, sp, #28
 ; DWARF-FP:    .cfi_def_cfa r11, 8
-; DWARF-FP:    sub sp, sp, #44
+; DWARF-FP:    sub sp, sp, #36
 ; DWARF-FP:    sub sp, r11, #28
 ; DWARF-FP:    pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; DWARF-FP:    mov pc, lr
diff --git a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
index 65aff46658fd1..73bad383eadd5 100644
--- a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
+++ b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
@@ -155,9 +155,7 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
 ; SOFT-NEXT:    vld1.64 {d18, d19}, [r12]
 ; SOFT-NEXT:    add r12, sp, #16
 ; SOFT-NEXT:    vmul.f16 q8, q9, q8
-; SOFT-NEXT:    vst1.64 {d16, d17}, [r12]
-; SOFT-NEXT:    vldr d16, [sp]
-; SOFT-NEXT:    vstr d16, [sp]
+; SOFT-NEXT:    vst1.64 {d16, d17}, [r12]
 ; SOFT-NEXT:    str r3, [sp, #8]
 ; SOFT-NEXT:    b use
 ;
@@ -185,10 +183,8 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
 ; SOFTEB-NEXT:    add r12, sp, #16
 ; SOFTEB-NEXT:    vrev64.16 q9, q9
 ; SOFTEB-NEXT:    vmul.f16 q8, q9, q8
-; SOFTEB-NEXT:    vldr d18, [sp]
 ; SOFTEB-NEXT:    vrev64.16 q8, q8
 ; SOFTEB-NEXT:    vst1.64 {d16, d17}, [r12]
-; SOFTEB-NEXT:    vstr d18, [sp]
 ; SOFTEB-NEXT:    str r3, [sp, #8]
 ; SOFTEB-NEXT:    b use
 ;
diff --git a/llvm/test/CodeGen/ARM/tail-call-stack-args.ll b/llvm/test/CodeGen/ARM/tail-call-stack-args.ll
new file mode 100644
index 0000000000000..ecad242e830b4
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/tail-call-stack-args.ll
@@ -0,0 +1,63 @@
+; RUN: llc %s -o - | FileCheck %s
+
+; Tail calls which have stack arguments in the same offsets as the caller do not
+; need to load and store the arguments from the stack.
+
+target triple = "armv7"
+
+declare void @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j)
+
+; CHECK-LABEL: wrapper_func:
+define void @wrapper_func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+  ; CHECK: @ %bb.
+  ; CHECK-NEXT: b func
+  tail call void @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j)
+  ret void
+}
+
+; CHECK-LABEL: wrapper_func_zero_arg:
+define void @wrapper_func_zero_arg(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+  ; CHECK: @ %bb.
+  ; CHECK-NEXT: mov r12, #0
+  ; CHECK-NEXT: str r12, [sp, #20]
+  ; CHECK-NEXT: b func
+  tail call void @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 0)
+  ret void
+}
+
+; CHECK-LABEL: wrapper_func_overridden_arg:
+define void @wrapper_func_overridden_arg(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j) {
+  ; CHECK: @ %bb.
+  ; CHECK-NEXT: push {r11, lr}
+  ; CHECK-NEXT: ldr r12, [sp, #24]
+  ; CHECK-NEXT: mov lr, #0
+  ; CHECK-NEXT: str lr, [sp, #24]
+  ; CHECK-NEXT: str r12, [sp, #28]
+  ; CHECK-NEXT: pop {r11, lr}
+  ; CHECK-NEXT: b func
+  tail call void @func(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 0, i32 %i)
+  ret void
+}
+
+declare void @func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j)
+
+; CHECK-LABEL: wrapper_func_i1:
+define void @wrapper_func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j) {
+  ; CHECK: @ %bb.
+  ; CHECK-NEXT: b func_i1
+  tail call void @func_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 %j)
+  ret void
+}
+
+declare void @func_signext_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 signext %j)
+
+; FIXME: Support zero/sign-extended stack arguments.
+; CHECK-LABEL: wrapper_func_signext_i1:
+define void @wrapper_func_signext_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 signext %j) {
+  ; CHECK: @ %bb.
+  ; CHECK-NEXT: ldr r12, [sp, #20]
+  ; CHECK-NEXT: str r12, [sp, #20]
+  ; CHECK-NEXT: b func_signext_i1
+  tail call void @func_signext_i1(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 signext %j)
+  ret void
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-be.ll b/llvm/test/CodeGen/Thumb2/mve-be.ll
index e1db733b13b41..b20dff3d90d28 100644
--- a/llvm/test/CodeGen/Thumb2/mve-be.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-be.ll
@@ -121,9 +121,6 @@ define <4 x i32> @call_soft(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LE-NEXT:    push {r7, lr}
 ; CHECK-LE-NEXT:    .pad #16
 ; CHECK-LE-NEXT:    sub sp, #16
-; CHECK-LE-NEXT:    add.w r12, sp, #24
-; CHECK-LE-NEXT:    vldrw.u32 q0, [r12]
-; CHECK-LE-NEXT:    vstrw.32 q0, [sp]
 ; CHECK-LE-NEXT:    vmov d1, r2, r3
 ; CHECK-LE-NEXT:    vmov d0, r0, r1
 ; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
@@ -144,9 +141,6 @@ define <4 x i32> @call_soft(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-BE-NEXT:    push {r7, lr}
 ; CHECK-BE-NEXT:    .pad #16
 ; CHECK-BE-NEXT:    sub sp, #16
-; CHECK-BE-NEXT:    add.w r12, sp, #24
-; CHECK-BE-NEXT:    vldrw.u32 q0, [r12]
-; CHECK-BE-NEXT:    vstrw.32 q0, [sp]
 ; CHECK-BE-NEXT:    vmov d1, r3, r2
 ; CHECK-BE-NEXT:    vmov d0, r1, r0
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0



More information about the llvm-commits mailing list