[llvm] [x86] Enable indirect tail calls with more arguments (PR #137643)
Hans Wennborg via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 28 07:49:18 PDT 2025
https://github.com/zmodem created https://github.com/llvm/llvm-project/pull/137643
X86ISelDAGToDAG's `isCalleeLoad` / `moveBelowOrigChain` try to move the load instruction next to the call so that the two can be folded, but until now they only allowed a single CopyToReg node in between.
This patch makes it look through multiple CopyToReg nodes, while being careful to only perform the transformation when the load+call can be folded.
Fixes #136848
From 0d00d2b6d446ec4d0191bdaa5b86fbf7aa0f3b93 Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans at chromium.org>
Date: Fri, 25 Apr 2025 17:08:52 +0200
Subject: [PATCH 1/2] (WORK IN PROGRESS) try to tail call address computation
w/ more than two args
this folds more stuff, but also finds new breakages
Fixes #136848
---
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 35 ++++++++++++++++---------
llvm/test/CodeGen/X86/cfguard-checks.ll | 3 +--
llvm/test/CodeGen/X86/fold-call-4.ll | 16 +++++++++++
3 files changed, 39 insertions(+), 15 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/fold-call-4.ll
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 01118beb9cf5e..be9ca87b781ef 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -897,20 +897,29 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
Chain = Chain.getOperand(0);
}
- if (!Chain.getNumOperands())
- return false;
- // Since we are not checking for AA here, conservatively abort if the chain
- // writes to memory. It's not safe to move the callee (a load) across a store.
- if (isa<MemSDNode>(Chain.getNode()) &&
- cast<MemSDNode>(Chain.getNode())->writeMem())
+ while (true) {
+ if (!Chain.getNumOperands())
+ return false;
+ // Since we are not checking for AA here, conservatively abort if the chain
+ // writes to memory. It's not safe to move the callee (a load) across a store.
+ if (isa<MemSDNode>(Chain.getNode()) &&
+ cast<MemSDNode>(Chain.getNode())->writeMem())
+ return false;
+
+ if (Chain.getOperand(0).getNode() == Callee.getNode())
+ return true;
+ if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+ Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+ Callee.getValue(1).hasOneUse())
+ return true;
+
+ // Look past CopyToReg's.
+ if (Chain.getOperand(0).getOpcode() == ISD::CopyToReg) {
+ Chain = Chain.getOperand(0);
+ continue;
+ }
return false;
- if (Chain.getOperand(0).getNode() == Callee.getNode())
- return true;
- if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
- Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
- Callee.getValue(1).hasOneUse())
- return true;
- return false;
+ }
}
static bool isEndbrImm64(uint64_t Imm) {
diff --git a/llvm/test/CodeGen/X86/cfguard-checks.ll b/llvm/test/CodeGen/X86/cfguard-checks.ll
index a727bbbfdcbe3..db19efaf910a3 100644
--- a/llvm/test/CodeGen/X86/cfguard-checks.ll
+++ b/llvm/test/CodeGen/X86/cfguard-checks.ll
@@ -210,8 +210,7 @@ entry:
; X64-LABEL: vmptr_thunk:
; X64: movq (%rcx), %rax
; X64-NEXT: movq 8(%rax), %rax
- ; X64-NEXT: movq __guard_dispatch_icall_fptr(%rip), %rdx
- ; X64-NEXT: rex64 jmpq *%rdx # TAILCALL
+ ; X64-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip) # TAILCALL
; X64-NOT: callq
}
diff --git a/llvm/test/CodeGen/X86/fold-call-4.ll b/llvm/test/CodeGen/X86/fold-call-4.ll
new file mode 100644
index 0000000000000..d22dfc1759613
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fold-call-4.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; The callee address computation should get folded into the call.
+; CHECK-LABEL: f:
+; CHECK-NOT: mov
+; CHECK: jmpq *(%rdi,%rsi,8)
+
+define void @f(ptr %table, i64 %idx) {
+entry:
+ %arrayidx = getelementptr inbounds ptr, ptr %table, i64 %idx
+ %funcptr = load ptr, ptr %arrayidx, align 8
+ tail call void %funcptr(ptr %table, i64 %idx)
+ ret void
+}
From 4913eea23129249ad20cd24dfe28e271d41e8391 Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans at chromium.org>
Date: Mon, 28 Apr 2025 16:03:48 +0200
Subject: [PATCH 2/2] fixes
---
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 28 +++++++++++++++++++++++--
llvm/test/CodeGen/X86/fold-call-4.ll | 9 +++-----
2 files changed, 29 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index be9ca87b781ef..7d6359f701368 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -890,6 +890,12 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
LD->getExtensionType() != ISD::NON_EXTLOAD)
return false;
+ // If the load's outgoing chain has more than one use, we can't (currently)
+ // move the load since we'd most likely create a loop. TODO: Maybe it could
+ // work if moveBelowOrigChain() updated *all* the chain users.
+ if (!Callee.getValue(1).hasOneUse())
+ return false;
+
// Now let's find the callseq_start.
while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
if (!Chain.hasOneUse())
@@ -913,11 +919,13 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
Callee.getValue(1).hasOneUse())
return true;
- // Look past CopyToReg's.
- if (Chain.getOperand(0).getOpcode() == ISD::CopyToReg) {
+ // Look past CopyToRegs. We only walk one path, so the chain mustn't branch.
+ if (Chain.getOperand(0).getOpcode() == ISD::CopyToReg &&
+ Chain.getOperand(0).getValue(0).hasOneUse()) {
Chain = Chain.getOperand(0);
continue;
}
+
return false;
}
}
@@ -1362,6 +1370,22 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
(N->getOpcode() == X86ISD::TC_RETURN &&
(Subtarget->is64Bit() ||
!getTargetMachine().isPositionIndependent())))) {
+
+ if (N->getOpcode() == X86ISD::TC_RETURN) {
+ // There needs to be enough non-callee-saved GPRs available to compute
+ // the load address if folded into the tailcall. See how the
+ // X86tcret_6regs and X86tcret_1reg classes are used and defined.
+ unsigned NumRegs = 0;
+ for (unsigned I = 3, E = N->getNumOperands(); I != E; ++I) {
+ if (isa<RegisterSDNode>(N->getOperand(I)))
+ ++NumRegs;
+ }
+ if (!Subtarget->is64Bit() && NumRegs > 1)
+ continue;
+ if (NumRegs > 6)
+ continue;
+ }
+
/// Also try moving call address load from outside callseq_start to just
/// before the call to allow it to be folded.
///
diff --git a/llvm/test/CodeGen/X86/fold-call-4.ll b/llvm/test/CodeGen/X86/fold-call-4.ll
index d22dfc1759613..708e05a0bfff0 100644
--- a/llvm/test/CodeGen/X86/fold-call-4.ll
+++ b/llvm/test/CodeGen/X86/fold-call-4.ll
@@ -1,16 +1,13 @@
-; RUN: llc < %s | FileCheck %s
-
-target triple = "x86_64-unknown-linux-gnu"
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
; The callee address computation should get folded into the call.
; CHECK-LABEL: f:
; CHECK-NOT: mov
; CHECK: jmpq *(%rdi,%rsi,8)
-
-define void @f(ptr %table, i64 %idx) {
+define void @f(ptr %table, i64 %idx, i64 %aux1, i64 %aux2, i64 %aux3) {
entry:
%arrayidx = getelementptr inbounds ptr, ptr %table, i64 %idx
%funcptr = load ptr, ptr %arrayidx, align 8
- tail call void %funcptr(ptr %table, i64 %idx)
+ tail call void %funcptr(ptr %table, i64 %idx, i64 %aux1, i64 %aux2, i64 %aux3)
ret void
}
More information about the llvm-commits mailing list