[llvm] [TaiDup] Allow large number of predecessors/successors without phis. (PR #116072)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 13 08:02:19 PST 2024


https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/116072

This adjusts the threshold logic added in #78582 to only trigger for cases where there are actually phis to duplicate in either TailBB or in one of the successors.

In cases there are no phis, we only have to pay the cost of extra edges, but have no explosion in PHI related instructions.

This improves performance of Python on some inputs by 2-3% on Apple Silicon CPUs.

>From d1452bdeb90082b08ab3d6c99d6c3372c77b8510 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 13 Nov 2024 15:34:52 +0000
Subject: [PATCH] [TaiDup] Allow large number of predecessors/successors
 without phis.

This adjusts the threshold logic added in #78582 to only trigger for
cases where there are actually phis to duplicate in either TailBB or in
one of the successors.

In cases there are no phis, we only have to pay the cost of extra edges,
but have no explosion in PHI related instructions.

This improves performance of Python on some inputs by 2-3% on Apple
Silicon CPUs.
---
 llvm/lib/CodeGen/TailDuplicator.cpp           | 23 ++++---
 .../CodeGen/X86/tail-dup-pred-succ-size.mir   | 60 ++++++++++---------
 2 files changed, 47 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index d0013923838b35..a333122b2ba035 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -573,14 +573,6 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   if (TailBB.isSuccessor(&TailBB))
     return false;
 
-  // Duplicating a BB which has both multiple predecessors and successors will
-  // result in a complex CFG and also may cause huge amount of PHI nodes. If we
-  // want to remove this limitation, we have to address
-  // https://github.com/llvm/llvm-project/issues/78578.
-  if (TailBB.pred_size() > TailDupPredSize &&
-      TailBB.succ_size() > TailDupSuccSize)
-    return false;
-
   // Set the limit on the cost to duplicate. When optimizing for size,
   // duplicate only one, because one branch instruction can be eliminated to
   // compensate for the duplication.
@@ -618,6 +610,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
   unsigned InstrCount = 0;
+  unsigned NumPhis = 0;
   for (MachineInstr &MI : TailBB) {
     // Non-duplicable things shouldn't be tail-duplicated.
     // CFI instructions are marked as non-duplicable, because Darwin compact
@@ -661,6 +654,20 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
 
     if (InstrCount > MaxDuplicateCount)
       return false;
+    NumPhis += MI.isPHI();
+  }
+
+  // Duplicating a BB which has both multiple predecessors and successors will
+  // may cause huge amount of PHI nodes. If we want to remove this limitation,
+  // we have to address https://github.com/llvm/llvm-project/issues/78578.
+  if (TailBB.pred_size() > TailDupPredSize &&
+      TailBB.succ_size() > TailDupSuccSize) {
+    // If TailBB or any of its successors contains a phi, we may have to add a
+    // large number of additional phis with additional incoming values.
+    if (NumPhis != 0 || any_of(TailBB.successors(), [](MachineBasicBlock *MBB) {
+          return any_of(*MBB, [](MachineInstr &MI) { return MI.isPHI(); });
+        }))
+      return false;
   }
 
   // Check if any of the successors of TailBB has a PHI node in which the
diff --git a/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir b/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir
index 1d17672e2c6bd0..2f1ff76fda76c9 100644
--- a/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir
+++ b/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir
@@ -538,43 +538,47 @@ body:             |
   ; LIMIT-NEXT:   JMP64m $noreg, 8, [[SUBREG_TO_REG]], %jump-table.0, $noreg
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT: bb.2:
-  ; LIMIT-NEXT:   successors: %bb.7(0x80000000)
+  ; LIMIT-NEXT:   successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000)
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg
-  ; LIMIT-NEXT:   JMP_1 %bb.7
+  ; LIMIT-NEXT:   [[SHR32ri1:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[AND32ri1:%[0-9]+]]:gr32 = AND32ri [[SHR32ri1]], 7, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[SUBREG_TO_REG1:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri1]], %subreg.sub_32bit
+  ; LIMIT-NEXT:   JMP64m $noreg, 8, [[SUBREG_TO_REG1]], %jump-table.1, $noreg
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT: bb.3:
-  ; LIMIT-NEXT:   successors: %bb.7(0x80000000)
+  ; LIMIT-NEXT:   successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000)
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT:   [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg
-  ; LIMIT-NEXT:   [[SHR32ri1:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm1]], 1, implicit-def dead $eflags
-  ; LIMIT-NEXT:   JMP_1 %bb.7
+  ; LIMIT-NEXT:   [[SHR32ri2:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm1]], 1, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[SHR32ri3:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[AND32ri2:%[0-9]+]]:gr32 = AND32ri [[SHR32ri3]], 7, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[SUBREG_TO_REG2:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri2]], %subreg.sub_32bit
+  ; LIMIT-NEXT:   JMP64m $noreg, 8, [[SUBREG_TO_REG2]], %jump-table.1, $noreg
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT: bb.4:
-  ; LIMIT-NEXT:   successors: %bb.7(0x80000000)
+  ; LIMIT-NEXT:   successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000)
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT:   [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg
-  ; LIMIT-NEXT:   [[SHR32ri2:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm2]], 2, implicit-def dead $eflags
-  ; LIMIT-NEXT:   JMP_1 %bb.7
+  ; LIMIT-NEXT:   [[SHR32ri4:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm2]], 2, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[SHR32ri5:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[AND32ri3:%[0-9]+]]:gr32 = AND32ri [[SHR32ri5]], 7, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[SUBREG_TO_REG3:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri3]], %subreg.sub_32bit
+  ; LIMIT-NEXT:   JMP64m $noreg, 8, [[SUBREG_TO_REG3]], %jump-table.1, $noreg
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT: bb.5:
-  ; LIMIT-NEXT:   successors: %bb.7(0x80000000)
+  ; LIMIT-NEXT:   successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000)
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT:   [[MOV32rm3:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg
-  ; LIMIT-NEXT:   [[SHR32ri3:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm3]], 3, implicit-def dead $eflags
-  ; LIMIT-NEXT:   JMP_1 %bb.7
+  ; LIMIT-NEXT:   [[SHR32ri6:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm3]], 3, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[SHR32ri7:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[AND32ri4:%[0-9]+]]:gr32 = AND32ri [[SHR32ri7]], 7, implicit-def dead $eflags
+  ; LIMIT-NEXT:   [[SUBREG_TO_REG4:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri4]], %subreg.sub_32bit
+  ; LIMIT-NEXT:   JMP64m $noreg, 8, [[SUBREG_TO_REG4]], %jump-table.1, $noreg
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT: bb.6:
   ; LIMIT-NEXT:   successors:
   ; LIMIT-NEXT: {{  $}}
-  ; LIMIT-NEXT: bb.7:
-  ; LIMIT-NEXT:   successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000)
-  ; LIMIT-NEXT: {{  $}}
-  ; LIMIT-NEXT:   [[SHR32ri4:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags
-  ; LIMIT-NEXT:   [[AND32ri1:%[0-9]+]]:gr32 = AND32ri [[SHR32ri4]], 7, implicit-def dead $eflags
-  ; LIMIT-NEXT:   [[SUBREG_TO_REG1:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, killed [[AND32ri1]], %subreg.sub_32bit
-  ; LIMIT-NEXT:   JMP64m $noreg, 8, [[SUBREG_TO_REG1]], %jump-table.1, $noreg
-  ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT: bb.9:
   ; LIMIT-NEXT:   [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg
   ; LIMIT-NEXT:   MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[MOV32rm4]] :: (store (s32))
@@ -583,23 +587,23 @@ body:             |
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT: bb.10:
   ; LIMIT-NEXT:   [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg
-  ; LIMIT-NEXT:   [[SHR32ri5:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm5]], 1, implicit-def dead $eflags
-  ; LIMIT-NEXT:   MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri5]] :: (store (s32))
-  ; LIMIT-NEXT:   $eax = COPY [[SHR32ri5]]
+  ; LIMIT-NEXT:   [[SHR32ri8:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm5]], 1, implicit-def dead $eflags
+  ; LIMIT-NEXT:   MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri8]] :: (store (s32))
+  ; LIMIT-NEXT:   $eax = COPY [[SHR32ri8]]
   ; LIMIT-NEXT:   RET 0, $eax
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT: bb.11:
   ; LIMIT-NEXT:   [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg
-  ; LIMIT-NEXT:   [[SHR32ri6:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm6]], 2, implicit-def dead $eflags
-  ; LIMIT-NEXT:   MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri6]] :: (store (s32))
-  ; LIMIT-NEXT:   $eax = COPY [[SHR32ri6]]
+  ; LIMIT-NEXT:   [[SHR32ri9:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm6]], 2, implicit-def dead $eflags
+  ; LIMIT-NEXT:   MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri9]] :: (store (s32))
+  ; LIMIT-NEXT:   $eax = COPY [[SHR32ri9]]
   ; LIMIT-NEXT:   RET 0, $eax
   ; LIMIT-NEXT: {{  $}}
   ; LIMIT-NEXT: bb.12:
   ; LIMIT-NEXT:   [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg
-  ; LIMIT-NEXT:   [[SHR32ri7:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm7]], 6, implicit-def dead $eflags
-  ; LIMIT-NEXT:   MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri7]] :: (store (s32))
-  ; LIMIT-NEXT:   $eax = COPY [[SHR32ri7]]
+  ; LIMIT-NEXT:   [[SHR32ri10:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm7]], 6, implicit-def dead $eflags
+  ; LIMIT-NEXT:   MOV32mr [[COPY1]], 1, $noreg, 0, $noreg, [[SHR32ri10]] :: (store (s32))
+  ; LIMIT-NEXT:   $eax = COPY [[SHR32ri10]]
   ; LIMIT-NEXT:   RET 0, $eax
   ;
   ; NOLIMIT-LABEL: name: foo_no_phis



More information about the llvm-commits mailing list