[llvm] [ARM] Be more precise about conditions for indirect tail-calls (PR #102451)

Thu Aug 8 05:09:57 PDT 2024

https://github.com/ostannard updated https://github.com/llvm/llvm-project/pull/102451

>From aa36ec311006ffdffaf5c13b0350092859edc412 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Thu, 8 Aug 2024 11:08:53 +0100
Subject: [PATCH 1/3] Pre-commit test

---
 .../indirect-tail-call-free-registers.ll      | 113 ++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll

diff --git a/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll b/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll
new file mode 100644
index 0000000000000..735907fb30578
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll
@@ -0,0 +1,113 @@
+; RUN: llc %s -o - -mtriple=thumbv8m.main -mattr=+vfp4 | FileCheck %s
+
+;; No outgoing arguments, plenty of free registers to hold the target address.
+define void @test0(ptr %fptr) {
+; CHECK-LABEL: test0:
+; CHECK: bx {{r0|r1|r2|r3|r12}}
+entry:
+  tail call void %fptr()
+  ret void
+}
+
+;; Four integer outgoing arguments, which use up r0-r3.
+define void @test1(ptr %fptr) {
+; CHECK-LABEL: test1:
+; CHECK: bx r12
+entry:
+  tail call void %fptr(i32 0, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;; Four integer outgoing arguments, which use up r0-r3, and sign-return-address
+;; uses r12, so we can never tail-call this.
+define void @test2(ptr %fptr) "sign-return-address"="all" {
+; CHECK-LABEL: test2:
+; CHECK: blx
+  entry:
+  tail call void %fptr(i32 0, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;; An i32 and an i64 argument, which uses r0, r2 and r3 for arguments, leaving
+;; r1 free for the address.
+define void @test3(ptr %fptr) {
+; CHECK-LABEL: test3:
+; CHECK: bx {{r1|r12}}
+entry:
+  tail call void %fptr(i32 0, i64 0)
+  ret void
+}
+
+;; Four float arguments, using the soft-float calling convention, which uses
+;; r0-r3.
+define void @test4(ptr %fptr) {
+; CHECK-LABEL: test4:
+; CHECK: bx r12
+entry:
+  tail call arm_aapcscc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0)
+  ret void
+}
+
+;; Four float arguments, using the soft-float calling convention, which uses
+;; r0-r3, and sign-return-address uses r12. Currently fails with "ran out of
+;; registers during register allocation".
+define void @test5(ptr %fptr) "sign-return-address"="all" {
+; CHECK-LABEL: test5:
+; CHECK: blx
+entry:
+  tail call arm_aapcscc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0)
+  ret void
+}
+
+;; Four float arguments, using the hard-float calling convention, which uses
+;; s0-s3, leaving all of the integer registers free for the address.
+define void @test6(ptr %fptr) {
+; CHECK-LABEL: test6:
+; CHECK: bx {{r0|r1|r2|r3|r12}}
+entry:
+  tail call arm_aapcs_vfpcc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0)
+  ret void
+}
+
+;; Four float arguments, using the hard-float calling convention, which uses
+;; s0-s3, leaving r0-r3 free for the address, with r12 used for
+;; sign-return-address.
+;; We could tail-call this, but currently don't.
+define void @test7(ptr %fptr) "sign-return-address"="all" {
+; CHECK-LABEL: test7:
+; CHECK: blx
+entry:
+  tail call arm_aapcs_vfpcc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0)
+  ret void
+}
+
+;; Two double arguments, using the soft-float calling convention, which uses
+;; r0-r3.
+define void @test8(ptr %fptr) {
+; CHECK-LABEL: test8:
+; CHECK: bx r12
+entry:
+  tail call arm_aapcscc void %fptr(double 0.0, double 0.0)
+  ret void
+}
+
+;; Two double arguments, using the soft-float calling convention, which uses
+;; r0-r3, and sign-return-address uses r12, so we can't tail-call this.
+;; This currently crashes with "error: ran out of registers during register
+;; allocation"
+;define void @test9(ptr %fptr) "sign-return-address"="all" {
+;entry:
+;  tail call arm_aapcscc void %fptr(double 0.0, double 0.0)
+;  ret void
+;}
+
+;; Four integer arguments (one on the stack), but dut to alignment r1 is left
+;; empty, so can be used for the tail-call.
+;; We could tail-call this, but currently don't.
+define void @test10(ptr %fptr, i64 %b, i32 %c) "sign-return-address"="all" {
+; CHECK-LABEL: test10:
+; CHECK: blx
+entry:
+  tail call void %fptr(i32 0, i64 %b, i32 %c)
+  ret void
+}

>From 330ee96157c624ab6bb50b7e3f578b740625623b Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Thu, 8 Aug 2024 11:33:24 +0100
Subject: [PATCH 2/3] [ARM] Be more precise about conditions for indirect
 tail-calls

This code was trying to predict the conditions in which an indirect
tail call will have a free register to hold the target address, and
falling back to a non-tail call if all non-callee-saved registers are
used for arguments or return address authentication.

However, it was only taking the number of arguments into account, not
which registers they are allocated to, so floating-point arguments could
cause this to give the wrong result, causing either a later error due to
the lack of a free register, or a missed optimisation of not doing the
tail call.

The assignment of arguments to registers is available at this point in
the code, so we can calculate exactly which registers will be available
for the tail-call.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp       | 33 ++++++++++++-------
 .../indirect-tail-call-free-registers.ll      | 20 +++++------
 2 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 75d16a42d0205..75b9e040a8633 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3024,18 +3024,27 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
 
   assert(Subtarget->supportsTailCall());
 
-  // Indirect tail calls cannot be optimized for Thumb1 if the args
-  // to the call take up r0-r3. The reason is that there are no legal registers
-  // left to hold the pointer to the function to be called.
-  // Similarly, if the function uses return address sign and authentication,
-  // r12 is needed to hold the PAC and is not available to hold the callee
-  // address.
-  if (Outs.size() >= 4 &&
-      (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) {
-    if (Subtarget->isThumb1Only())
-      return false;
-    // Conservatively assume the function spills LR.
-    if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true))
+  // Indirect tail-calls require a register to hold the target address. That
+  // register must be:
+  // * Allocatable (i.e. r0-r7 if the target is Thumb1).
+  // * Not callee-saved, so must be one of r0-r3 or r12.
+  // * Not used to hold an argument to the tail-called function, which might be
+  //   in r0-r3.
+  // * Not used to hold the return address authentication code, which is in r12
+  //   if enabled.
+  // Sometimes, no register matches all of these conditions, so we can't do a
+  // tail-call.
+  if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
+    SmallSet<MCPhysReg, 5> AddressRegisters;
+    for (Register R : {ARM::R0, ARM::R1, ARM::R2, ARM::R3})
+      AddressRegisters.insert(R);
+    if (!(Subtarget->isThumb1Only() or
+          MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
+      AddressRegisters.insert(ARM::R12);
+    for (const CCValAssign &AL : ArgLocs)
+      if (AL.isRegLoc())
+        AddressRegisters.erase(AL.getLocReg());
+    if (AddressRegisters.empty())
       return false;
   }
 
diff --git a/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll b/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll
index 735907fb30578..c6ace3eb55b28 100644
--- a/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll
+++ b/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll
@@ -72,10 +72,9 @@ entry:
 ;; Four float arguments, using the hard-float calling convention, which uses
 ;; s0-s3, leaving r0-r3 free for the address, with r12 used for
 ;; sign-return-address.
-;; We could tail-call this, but currently don't.
 define void @test7(ptr %fptr) "sign-return-address"="all" {
 ; CHECK-LABEL: test7:
-; CHECK: blx
+; CHECK: bx {{r0|r1|r2|r3}}
 entry:
   tail call arm_aapcs_vfpcc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0)
   ret void
@@ -93,20 +92,19 @@ entry:
 
 ;; Two double arguments, using the soft-float calling convention, which uses
 ;; r0-r3, and sign-return-address uses r12, so we can't tail-call this.
-;; This currently crashes with "error: ran out of registers during register
-;; allocation"
-;define void @test9(ptr %fptr) "sign-return-address"="all" {
-;entry:
-;  tail call arm_aapcscc void %fptr(double 0.0, double 0.0)
-;  ret void
-;}
+define void @test9(ptr %fptr) "sign-return-address"="all" {
+; CHECK-LABEL: test9:
+; CHECK: blx
+entry:
+  tail call arm_aapcscc void %fptr(double 0.0, double 0.0)
+  ret void
+}
 
 ;; Four integer arguments (one on the stack), but dut to alignment r1 is left
 ;; empty, so can be used for the tail-call.
-;; We could tail-call this, but currently don't.
 define void @test10(ptr %fptr, i64 %b, i32 %c) "sign-return-address"="all" {
 ; CHECK-LABEL: test10:
-; CHECK: blx
+; CHECK: bx r1
 entry:
   tail call void %fptr(i32 0, i64 %b, i32 %c)
   ret void

>From 1b3550591064e5a5d9749fc16ab646b2d1517c8e Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Thu, 8 Aug 2024 13:09:39 +0100
Subject: [PATCH 3/3] Don't use the iso646 'or' operator

---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 75b9e040a8633..476b7b349294a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3038,7 +3038,7 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
     SmallSet<MCPhysReg, 5> AddressRegisters;
     for (Register R : {ARM::R0, ARM::R1, ARM::R2, ARM::R3})
       AddressRegisters.insert(R);
-    if (!(Subtarget->isThumb1Only() or
+    if (!(Subtarget->isThumb1Only() ||
           MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
       AddressRegisters.insert(ARM::R12);
     for (const CCValAssign &AL : ArgLocs)