[llvm] 1f9fa54 - [Taildup] Don't tail-duplicate loop header with multiple successors as its latches

Mon Nov 1 00:33:04 PDT 2021

Author: Jun Ma
Date: 2021-11-01T15:32:00+08:00
New Revision: 1f9fa549841a2ec55aa5a131bfaf83f0383c4713

URL: https://github.com/llvm/llvm-project/commit/1f9fa549841a2ec55aa5a131bfaf83f0383c4713
DIFF: https://github.com/llvm/llvm-project/commit/1f9fa549841a2ec55aa5a131bfaf83f0383c4713.diff

LOG: [Taildup] Don't tail-duplicate loop header with multiple successors as its latches

when Taildup hit loop with multiple latches like:
  //    1 -> 2 <-> 3                 |
  //          \  <-> 4               |
  //           \   <-> 5             |
  //            \---> rest           |
it may transform this loop into multiple loops by duplicate loop header.
However, this change may has little benefit while makes cfg much complex.
In some uncommon cases, it causes large compile time regression (offered by
@alexfh in D106056).

This patch disable tail-duplicate of such cases.

TestPlan: check-llvm

Differential Revision: https://reviews.llvm.org/D110613

Added: 
    

Modified: 
    llvm/lib/CodeGen/TailDuplicator.cpp
    llvm/test/CodeGen/X86/tail-dup-multiple-latch-loop.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index 1fb7128a6decb..806cb17e3036f 100644

--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -70,6 +70,12 @@ static cl::opt<unsigned> TailDupIndirectBranchSize(
              "end with indirect branches."), cl::init(20),
     cl::Hidden);
 
+static cl::opt<unsigned> TailDupJmpTableLoopSize(
+    "tail-dup-jmptable-loop-size",
+    cl::desc("Maximum loop latches to consider tail duplication that are "
+             "successors of loop header."),
+    cl::init(128), cl::Hidden);
+
 static cl::opt<bool>
     TailDupVerify("tail-dup-verify",
                   cl::desc("Verify sanity of PHI instructions during taildup"),
@@ -563,6 +569,29 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   if (TailBB.isSuccessor(&TailBB))
     return false;
 
+  // When doing tail-duplication with jumptable loops like:
+  //    1 -> 2 <-> 3                 |
+  //          \  <-> 4               |
+  //           \   <-> 5             |
+  //            \    <-> ...         |
+  //             \---> rest          |
+  // quadratic number of edges and much more loops are added to CFG. This
+  // may cause compile time regression when jumptable is quiet large.
+  // So set the limit on jumptable cases.
+  auto isLargeJumpTableLoop = [](const MachineBasicBlock &TailBB) {
+    const SmallPtrSet<const MachineBasicBlock *, 8> Preds(TailBB.pred_begin(),
+                                                          TailBB.pred_end());
+    // Check the basic block has large number of successors, all of them only
+    // have one successor which is the basic block itself.
+    return llvm::count_if(
+               TailBB.successors(), [&](const MachineBasicBlock *SuccBB) {
+                 return Preds.count(SuccBB) && SuccBB->succ_size() == 1;
+               }) > TailDupJmpTableLoopSize;
+  };
+
+  if (isLargeJumpTableLoop(TailBB))
+    return false;
+
   // Set the limit on the cost to duplicate. When optimizing for size,
   // duplicate only one, because one branch instruction can be eliminated to
   // compensate for the duplication.

diff  --git a/llvm/test/CodeGen/X86/tail-dup-multiple-latch-loop.ll b/llvm/test/CodeGen/X86/tail-dup-multiple-latch-loop.ll
index 2032c7244331c..25da377ec487b 100644
--- a/llvm/test/CodeGen/X86/tail-dup-multiple-latch-loop.ll
+++ b/llvm/test/CodeGen/X86/tail-dup-multiple-latch-loop.ll
@@ -1,76 +1,48 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -tail-dup-jmptable-loop-size=5 -mtriple=x86_64-unknown-linux-gnu  | FileCheck %s
 define i8* @large_loop_switch(i8* %p) {
 ; CHECK-LABEL: large_loop_switch:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq %rdi, %rsi
 ; CHECK-NEXT:    movl $6, %ebx
-; CHECK-NEXT:    movl %ebx, %ecx
-; CHECK-NEXT:    jmpq *.LJTI0_0(,%rcx,8)
-; CHECK-NEXT:  .LBB0_1: # %for.cond.cleanup
-; CHECK-NEXT:    movl $530, %edi # imm = 0x212
-; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    jmp ccc at PLT # TAILCALL
-; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    jmpq *.LJTI0_0(,%rax,8)
 ; CHECK-NEXT:  .LBB0_2: # %sw.bb1
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    movl $531, %edi # imm = 0x213
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:  .LBB0_3: # %for.body
 ; CHECK-NEXT:    callq ccc at PLT
+; CHECK-NEXT:  .LBB0_4: # %for.body
+; CHECK-NEXT:    movq %rax, %rsi
 ; CHECK-NEXT:    decl %ebx
-; CHECK-NEXT:    movl %ebx, %ecx
-; CHECK-NEXT:    jmpq *.LJTI0_0(,%rcx,8)
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_3: # %sw.bb3
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    jmpq *.LJTI0_0(,%rax,8)
+; CHECK-NEXT:  .LBB0_5: # %sw.bb3
 ; CHECK-NEXT:    movl $532, %edi # imm = 0x214
-; CHECK-NEXT:    movq %rax, %rsi
 ; CHECK-NEXT:    callq bbb at PLT
-; CHECK-NEXT:    decl %ebx
-; CHECK-NEXT:    movl %ebx, %ecx
-; CHECK-NEXT:    jmpq *.LJTI0_0(,%rcx,8)
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_4: # %sw.bb5
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    jmp .LBB0_4
+; CHECK-NEXT:  .LBB0_7: # %sw.bb5
 ; CHECK-NEXT:    movl $533, %edi # imm = 0x215
-; CHECK-NEXT:    movq %rax, %rsi
 ; CHECK-NEXT:    callq bbb at PLT
-; CHECK-NEXT:    decl %ebx
-; CHECK-NEXT:    movl %ebx, %ecx
-; CHECK-NEXT:    jmpq *.LJTI0_0(,%rcx,8)
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_5: # %sw.bb7
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    jmp .LBB0_4
+; CHECK-NEXT:  .LBB0_8: # %sw.bb7
 ; CHECK-NEXT:    movl $535, %edi # imm = 0x217
-; CHECK-NEXT:    movq %rax, %rsi
 ; CHECK-NEXT:    callq bbb at PLT
-; CHECK-NEXT:    decl %ebx
-; CHECK-NEXT:    movl %ebx, %ecx
-; CHECK-NEXT:    jmpq *.LJTI0_0(,%rcx,8)
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_6: # %sw.bb9
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    jmp .LBB0_4
+; CHECK-NEXT:  .LBB0_9: # %sw.bb9
 ; CHECK-NEXT:    movl $536, %edi # imm = 0x218
-; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    callq ccc at PLT
-; CHECK-NEXT:    decl %ebx
-; CHECK-NEXT:    movl %ebx, %ecx
-; CHECK-NEXT:    jmpq *.LJTI0_0(,%rcx,8)
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_7: # %sw.bb11
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_10: # %sw.bb11
 ; CHECK-NEXT:    movl $658, %edi # imm = 0x292
-; CHECK-NEXT:    movq %rax, %rsi
 ; CHECK-NEXT:    callq bbb at PLT
-; CHECK-NEXT:    decl %ebx
-; CHECK-NEXT:    movl %ebx, %ecx
-; CHECK-NEXT:    jmpq *.LJTI0_0(,%rcx,8)
+; CHECK-NEXT:    jmp .LBB0_4
+; CHECK-NEXT:  .LBB0_11: # %for.cond.cleanup
+; CHECK-NEXT:    movl $530, %edi # imm = 0x212
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    jmp ccc at PLT # TAILCALL
 entry:
   br label %for.body