[llvm] [TailDup] Delay aggressive computed-goto taildup to after RegAlloc. (PR #150911)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 28 02:37:39 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Florian Hahn (fhahn)
<details>
<summary>Changes</summary>
https://github.com/llvm/llvm-project/pull/114990 allowed more aggressive
tail duplication for computed-gotos in both pre- and post-regalloc tail
duplication.
In some cases, performing tail-duplication too early can lead to worse
results, especially if we duplicate blocks with a number of phi nodes.
This is causing a ~3% performance regression in some workloads using
Python 3.12.
This patch updates TailDup to delay aggressive tail-duplication for
computed gotos to after register allocation.
This means we can keep the non-duplicated version for a bit longer
throughout the backend, which should reduce compile-time as well as
allowing a number of optimizations and simplifications to trigger before
drastically expanding the CFG.
For the case in https://github.com/llvm/llvm-project/issues/106846, I
get the same performance with and without this patch on Skylake.
---
Full diff: https://github.com/llvm/llvm-project/pull/150911.diff
3 Files Affected:
- (modified) llvm/include/llvm/CodeGen/MachineBasicBlock.h (+1-1)
- (modified) llvm/lib/CodeGen/TailDuplicator.cpp (+10-6)
- (renamed) llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir (+23-21)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 938d71dd030e8..39345ced8ce9a 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -327,7 +327,7 @@ class MachineBasicBlock
/// typically corresponds to a `goto` in C, rather than jump tables.
bool terminatorIsComputedGoto() const {
return back().isIndirectBranch() &&
- llvm::all_of(successors(), [](const MachineBasicBlock *Succ) {
+ !succ_empty() && llvm::all_of(successors(), [](const MachineBasicBlock *Succ) {
return Succ->isIRBlockAddressTaken();
});
}
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index a88c57fdc165a..7f0488e4c63fd 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -610,6 +610,15 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
if (HasIndirectbr && PreRegAlloc)
MaxDuplicateCount = TailDupIndirectBranchSize;
+ // Allow higher limits when the block has computed-gotos and running after
+ // register allocation. NB. This basically unfactors computed gotos that were
+ // factored early on in the compilation process to speed up edge based data
+ // flow. If we do not unfactor them again, it can seriously pessimize code
+ // with many computed jumps in the source code, such as interpreters.
+ // Therefore we do not restrict the computed gotos.
+ if (HasComputedGoto && !PreRegAlloc)
+ MaxDuplicateCount = std::max(MaxDuplicateCount, 10u);
+
// Check the instructions in the block to determine whether tail-duplication
// is invalid or unlikely to be profitable.
unsigned InstrCount = 0;
@@ -663,12 +672,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
// Duplicating a BB which has both multiple predecessors and successors will
// may cause huge amount of PHI nodes. If we want to remove this limitation,
// we have to address https://github.com/llvm/llvm-project/issues/78578.
- // NB. This basically unfactors computed gotos that were factored early on in
- // the compilation process to speed up edge based data flow. If we do not
- // unfactor them again, it can seriously pessimize code with many computed
- // jumps in the source code, such as interpreters. Therefore we do not
- // restrict the computed gotos.
- if (!HasComputedGoto && TailBB.pred_size() > TailDupPredSize &&
+ if (PreRegAlloc && TailBB.pred_size() > TailDupPredSize &&
TailBB.succ_size() > TailDupSuccSize) {
// If TailBB or any of its successors contains a phi, we may have to add a
// large number of additional phis with additional incoming values.
diff --git a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
similarity index 93%
rename from llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
rename to llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
index 17de405928d37..0f2896463a8af 100644
--- a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
+++ b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
@@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
-# Check that only the computed goto is not be restrict by tail-dup-pred-size and tail-dup-succ-size.
+#
+# Check that only the computed goto and others are restricted by tail-dup-pred-size and tail-dup-succ-size.
+#
--- |
@computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)]
declare i64 @f0()
@@ -30,54 +32,54 @@ tracksRegLiveness: true
body: |
; CHECK-LABEL: name: computed_goto
; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rax
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]]
- ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY]], @computed_goto.dispatch, $noreg
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: JMP_1 %bb.5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
- ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY $rax
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY [[COPY2]]
- ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: JMP_1 %bb.5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
- ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64_nosp = COPY $rax
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
- ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY4]], @computed_goto.dispatch, $noreg
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: JMP_1 %bb.5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
- ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = COPY $rax
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[COPY6]]
- ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY6]], @computed_goto.dispatch, $noreg
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: JMP_1 %bb.5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
- ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64_nosp = COPY $rax
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = COPY [[COPY8]]
- ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY8]], @computed_goto.dispatch, $noreg
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5:
+ ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:gr64_nosp = PHI [[COPY]], %bb.0, [[COPY4]], %bb.4, [[COPY3]], %bb.3, [[COPY2]], %bb.2, [[COPY1]], %bb.1
+ ; CHECK-NEXT: JMP64m $noreg, 8, [[PHI]], @computed_goto.dispatch, $noreg
bb.0:
ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
``````````
</details>
https://github.com/llvm/llvm-project/pull/150911
More information about the llvm-commits
mailing list