[llvm] r303084 - CodeGen: BlockPlacement: Increase tail duplication size for O3.

Vitaly Buka via llvm-commits llvm-commits at lists.llvm.org
Mon May 15 17:10:35 PDT 2017


+Dmitry Vyukov <dvyukov at google.com>

On Mon, May 15, 2017 at 5:09 PM Vitaly Buka <vitalybuka at google.com> wrote:

> The "tsan analyze" step on the sanitizer-x86_64-linux-autoconf bot is broken after this patch:
> http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-autoconf/builds/8452/steps/tsan%20analyze/logs/stdio
>
> On Mon, May 15, 2017 at 10:44 AM Kyle Butt via llvm-commits <
> llvm-commits at lists.llvm.org> wrote:
>
>> Author: iteratee
>> Date: Mon May 15 12:30:47 2017
>> New Revision: 303084
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=303084&view=rev
>> Log:
>> CodeGen: BlockPlacement: Increase tail duplication size for O3.
>>
>> At O3 we are more willing to increase size if we believe it will improve
>> performance. The current threshold for tail-duplication of 2 instructions
>> is conservative, and can be relaxed at O3.
>>
>> Benchmark results:
>> llvm test-suite:
>> 6% improvement in aha, due to duplication of loop latch
>> 3% improvement in hexxagon
>>
>> 2% slowdown in lpbench. It seems related to this change, but I couldn't
>> completely diagnose it.
>>
>> Internal google benchmark:
>> Produces 4% improvement on internal google protocol buffer serialization
>> benchmarks.
>>
>> Differential-Revision: https://reviews.llvm.org/D32324
>>
>> Modified:
>>     llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp
>>     llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll
>>     llvm/trunk/test/CodeGen/X86/sse1.ll
>>
>> Modified: llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp?rev=303084&r1=303083&r2=303084&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp (original)
>> +++ llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp Mon May 15 12:30:47 2017
>> @@ -133,6 +133,14 @@ static cl::opt<unsigned> TailDupPlacemen
>>               "that won't conflict."), cl::init(2),
>>      cl::Hidden);
>>
>> +// Heuristic for aggressive tail duplication.
>> +static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
>> +    "tail-dup-placement-aggressive-threshold",
>> +    cl::desc("Instruction cutoff for aggressive tail duplication during "
>> +             "layout. Used at -O3. Tail merging during layout is forced to "
>> +             "have a threshold that won't conflict."), cl::init(3),
>> +    cl::Hidden);
>> +
>>  // Heuristic for tail duplication.
>>  static cl::opt<unsigned> TailDupPlacementPenalty(
>>      "tail-dup-placement-penalty",
>> @@ -2646,9 +2654,26 @@ bool MachineBlockPlacement::runOnMachine
>>    assert(BlockToChain.empty());
>>    assert(ComputedEdges.empty());
>>
>> +  unsigned TailDupSize = TailDupPlacementThreshold;
>> +  // If only the aggressive threshold is explicitly set, use it.
>> +  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
>> +      TailDupPlacementThreshold.getNumOccurrences() == 0)
>> +    TailDupSize = TailDupPlacementAggressiveThreshold;
>> +
>> +  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
>> +  // For aggressive optimization, we can adjust some thresholds to be less
>> +  // conservative.
>> +  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
>> +    // At O3 we should be more willing to copy blocks for tail duplication. This
>> +    // increases size pressure, so we only do it at O3
>> +    // Do this unless only the regular threshold is explicitly set.
>> +    if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
>> +        TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
>> +      TailDupSize = TailDupPlacementAggressiveThreshold;
>> +  }
>> +
>>    if (TailDupPlacement) {
>>      MPDT = &getAnalysis<MachinePostDominatorTree>();
>> -    unsigned TailDupSize = TailDupPlacementThreshold;
>>      if (MF.getFunction()->optForSize())
>>        TailDupSize = 1;
>>      TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
>> @@ -2658,7 +2683,6 @@ bool MachineBlockPlacement::runOnMachine
>>    buildCFGChains();
>>
>>    // Changing the layout can create new tail merging opportunities.
>> -  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
>>    // TailMerge can create jump into if branches that make CFG irreducible for
>>    // HW that requires structured CFG.
>>    bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
>> @@ -2666,7 +2690,7 @@ bool MachineBlockPlacement::runOnMachine
>>                           BranchFoldPlacement;
>>    // No tail merging opportunities if the block number is less than four.
>>    if (MF.size() > 3 && EnableTailMerge) {
>> -    unsigned TailMergeSize = TailDupPlacementThreshold + 1;
>> +    unsigned TailMergeSize = TailDupSize + 1;
>>      BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
>>                      *MBPI, TailMergeSize);
>>
>>
>> Modified: llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll?rev=303084&r1=303083&r2=303084&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll (original)
>> +++ llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll Mon May 15 12:30:47 2017
>> @@ -1,4 +1,5 @@
>> -; RUN: llc -O2 < %s | FileCheck %s
>> +; RUN: llc -O2 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O2 %s
>> +; RUN: llc -O3 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O3 %s
>>  target datalayout = "e-m:e-i64:64-n32:64"
>>  target triple = "powerpc64le-grtev4-linux-gnu"
>>
>> @@ -99,11 +100,9 @@ exit:
>>  ; test1
>>  ; test2
>>  ; test3
>> -; test4
>>  ; optional1
>>  ; optional2
>>  ; optional3
>> -; optional4
>>  ; exit
>>  ; even for 50/50 branches.
>>  ; Tail duplication puts test n+1 at the end of optional n
>> @@ -157,6 +156,98 @@ test3:
>>    br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1
>>  optional3:
>>    call void @c()
>> +  br label %exit
>> +exit:
>> +  ret void
>> +}
>> +
>> +; Intended layout:
>> +; The chain-of-triangles based duplicating produces the layout when 3
>> +; instructions are allowed for tail-duplication.
>> +; test1
>> +; test2
>> +; test3
>> +; optional1
>> +; optional2
>> +; optional3
>> +; exit
>> +;
>> +; Otherwise it produces the layout:
>> +; test1
>> +; optional1
>> +; test2
>> +; optional2
>> +; test3
>> +; optional3
>> +; exit
>> +
>> +;CHECK-LABEL: straight_test_3_instr_test:
>> +; test1 may have been merged with entry
>> +;CHECK: mr [[TAGREG:[0-9]+]], 3
>> +;CHECK: clrlwi {{[0-9]+}}, [[TAGREG]], 30
>> +;CHECK-NEXT: cmplwi {{[0-9]+}}, 2
>> +
>> +;CHECK-O3-NEXT: bne 0, .[[OPT1LABEL:[_0-9A-Za-z]+]]
>> +;CHECK-O3-NEXT: # %test2
>> +;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
>> +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8
>> +;CHECK-O3-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
>> +;CHECK-O3-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
>> +;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
>> +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32
>> +;CHECK-O3-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
>> +;CHECK-O3-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
>> +;CHECK-O3: blr
>> +;CHECK-O3-NEXT: .[[OPT1LABEL]]:
>> +;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
>> +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8
>> +;CHECK-O3-NEXT: beq 0, .[[TEST3LABEL]]
>> +;CHECK-O3-NEXT: .[[OPT2LABEL]]:
>> +;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
>> +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32
>> +;CHECK-O3-NEXT: beq 0, .[[EXITLABEL]]
>> +;CHECK-O3-NEXT: .[[OPT3LABEL]]:
>> +;CHECK-O3: b .[[EXITLABEL]]
>> +
>> +;CHECK-O2-NEXT: beq 0, .[[TEST2LABEL:[_0-9A-Za-z]+]]
>> +;CHECK-O2-NEXT: # %optional1
>> +;CHECK-O2: .[[TEST2LABEL]]: # %test2
>> +;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
>> +;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 8
>> +;CHECK-O2-NEXT: beq 0, .[[TEST3LABEL:[_0-9A-Za-z]+]]
>> +;CHECK-O2-NEXT: # %optional2
>> +;CHECK-O2: .[[TEST3LABEL]]: # %test3
>> +;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
>> +;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 32
>> +;CHECK-O2-NEXT: beq 0, .[[EXITLABEL:[_0-9A-Za-z]+]]
>> +;CHECK-O2-NEXT: # %optional3
>> +;CHECK-O2: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
>> +;CHECK-O2: blr
>> +
>> +
>> +define void @straight_test_3_instr_test(i32 %tag) {
>> +entry:
>> +  br label %test1
>> +test1:
>> +  %tagbit1 = and i32 %tag, 3
>> +  %tagbit1eq0 = icmp eq i32 %tagbit1, 2
>> +  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2
>> +optional1:
>> +  call void @a()
>> +  br label %test2
>> +test2:
>> +  %tagbit2 = and i32 %tag, 12
>> +  %tagbit2eq0 = icmp eq i32 %tagbit2, 8
>> +  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2
>> +optional2:
>> +  call void @b()
>> +  br label %test3
>> +test3:
>> +  %tagbit3 = and i32 %tag, 48
>> +  %tagbit3eq0 = icmp eq i32 %tagbit3, 32
>> +  br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1
>> +optional3:
>> +  call void @c()
>>    br label %exit
>>  exit:
>>    ret void
>>
>> Modified: llvm/trunk/test/CodeGen/X86/sse1.ll
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse1.ll?rev=303084&r1=303083&r2=303084&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/sse1.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/sse1.ll Mon May 15 12:30:47 2017
>> @@ -66,7 +66,10 @@ define <4 x float> @vselect(<4 x float>*
>>  ; X32-NEXT:    jne .LBB1_8
>>  ; X32-NEXT:  .LBB1_7:
>>  ; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
>> -; X32-NEXT:    jmp .LBB1_9
>> +; X32-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
>> +; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
>> +; X32-NEXT:    je .LBB1_10
>> +; X32-NEXT:    jmp .LBB1_11
>>  ; X32-NEXT:  .LBB1_1:
>>  ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
>>  ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
>> @@ -77,11 +80,10 @@ define <4 x float> @vselect(<4 x float>*
>>  ; X32-NEXT:    je .LBB1_7
>>  ; X32-NEXT:  .LBB1_8: # %entry
>>  ; X32-NEXT:    xorps %xmm3, %xmm3
>> -; X32-NEXT:  .LBB1_9: # %entry
>>  ; X32-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
>>  ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
>>  ; X32-NEXT:    jne .LBB1_11
>> -; X32-NEXT:  # BB#10:
>> +; X32-NEXT:  .LBB1_10:
>>  ; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
>>  ; X32-NEXT:  .LBB1_11: # %entry
>>  ; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
>> @@ -103,7 +105,10 @@ define <4 x float> @vselect(<4 x float>*
>>  ; X64-NEXT:    jne .LBB1_8
>>  ; X64-NEXT:  .LBB1_7:
>>  ; X64-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
>> -; X64-NEXT:    jmp .LBB1_9
>> +; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
>> +; X64-NEXT:    testl %esi, %esi
>> +; X64-NEXT:    je .LBB1_10
>> +; X64-NEXT:    jmp .LBB1_11
>>  ; X64-NEXT:  .LBB1_1:
>>  ; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
>>  ; X64-NEXT:    testl %edx, %edx
>> @@ -114,11 +119,10 @@ define <4 x float> @vselect(<4 x float>*
>>  ; X64-NEXT:    je .LBB1_7
>>  ; X64-NEXT:  .LBB1_8: # %entry
>>  ; X64-NEXT:    xorps %xmm3, %xmm3
>> -; X64-NEXT:  .LBB1_9: # %entry
>>  ; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
>>  ; X64-NEXT:    testl %esi, %esi
>>  ; X64-NEXT:    jne .LBB1_11
>> -; X64-NEXT:  # BB#10:
>> +; X64-NEXT:  .LBB1_10:
>>  ; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
>>  ; X64-NEXT:  .LBB1_11: # %entry
>>  ; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20170516/05f39166/attachment-0001.html>


More information about the llvm-commits mailing list