<div dir="ltr"><div>This test is broken after the patch: <a href="http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-autoconf/builds/8452/steps/tsan%20analyze/logs/stdio">http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-autoconf/builds/8452/steps/tsan%20analyze/logs/stdio</a><br></div></div><br><div class="gmail_quote"><div dir="ltr">On Mon, May 15, 2017 at 10:44 AM Kyle Butt via llvm-commits &lt;<a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Author: iteratee<br>
Date: Mon May 15 12:30:47 2017<br>
New Revision: 303084<br>
<br>
URL: <a href="http://llvm.org/viewvc/llvm-project?rev=303084&view=rev" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-project?rev=303084&view=rev</a><br>
Log:<br>
CodeGen: BlockPlacement: Increase tail duplication size for O3.<br>
<br>
At O3 we are more willing to increase size if we believe it will improve<br>
performance. The current threshold for tail-duplication of 2 instructions is<br>
conservative, and can be relaxed at O3.<br>
<br>
Benchmark results:<br>
llvm test-suite:<br>
6% improvement in aha, due to duplication of loop latch<br>
3% improvement in hexxagon<br>
<br>
2% slowdown in lpbench. It seems related to this change, but I couldn't completely diagnose it.<br>
<br>
Internal google benchmark:<br>
Produces 4% improvement on internal google protocol buffer serialization<br>
benchmarks.<br>
<br>
Differential-Revision: <a href="https://reviews.llvm.org/D32324" rel="noreferrer" target="_blank">https://reviews.llvm.org/D32324</a><br>
<br>
Modified:<br>
llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp<br>
llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll<br>
llvm/trunk/test/CodeGen/X86/sse1.ll<br>
<br>
Modified: llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp?rev=303084&r1=303083&r2=303084&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp?rev=303084&r1=303083&r2=303084&view=diff</a><br>
==============================================================================<br>
--- llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp (original)<br>
+++ llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp Mon May 15 12:30:47 2017<br>
@@ -133,6 +133,14 @@ static cl::opt<unsigned> TailDupPlacemen<br>
"that won't conflict."), cl::init(2),<br>
cl::Hidden);<br>
<br>
+// Heuristic for aggressive tail duplication.<br>
+static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(<br>
+ "tail-dup-placement-aggressive-threshold",<br>
+ cl::desc("Instruction cutoff for aggressive tail duplication during "<br>
+ "layout. Used at -O3. Tail merging during layout is forced to "<br>
+ "have a threshold that won't conflict."), cl::init(3),<br>
+ cl::Hidden);<br>
+<br>
// Heuristic for tail duplication.<br>
static cl::opt<unsigned> TailDupPlacementPenalty(<br>
"tail-dup-placement-penalty",<br>
@@ -2646,9 +2654,26 @@ bool MachineBlockPlacement::runOnMachine<br>
assert(BlockToChain.empty());<br>
assert(ComputedEdges.empty());<br>
<br>
+ unsigned TailDupSize = TailDupPlacementThreshold;<br>
+ // If only the aggressive threshold is explicitly set, use it.<br>
+ if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&<br>
+ TailDupPlacementThreshold.getNumOccurrences() == 0)<br>
+ TailDupSize = TailDupPlacementAggressiveThreshold;<br>
+<br>
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();<br>
+ // For aggressive optimization, we can adjust some thresholds to be less<br>
+ // conservative.<br>
+ if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {<br>
+ // At O3 we should be more willing to copy blocks for tail duplication. This<br>
+ // increases size pressure, so we only do it at O3.<br>
+ // Do this unless only the regular threshold is explicitly set.<br>
+ if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||<br>
+ TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)<br>
+ TailDupSize = TailDupPlacementAggressiveThreshold;<br>
+ }<br>
+<br>
if (TailDupPlacement) {<br>
MPDT = &getAnalysis<MachinePostDominatorTree>();<br>
- unsigned TailDupSize = TailDupPlacementThreshold;<br>
if (MF.getFunction()->optForSize())<br>
TailDupSize = 1;<br>
TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);<br>
@@ -2658,7 +2683,6 @@ bool MachineBlockPlacement::runOnMachine<br>
buildCFGChains();<br>
<br>
// Changing the layout can create new tail merging opportunities.<br>
- TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();<br>
// TailMerge can create jump into if branches that make CFG irreducible for<br>
// HW that requires structured CFG.<br>
bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&<br>
@@ -2666,7 +2690,7 @@ bool MachineBlockPlacement::runOnMachine<br>
BranchFoldPlacement;<br>
// No tail merging opportunities if the block number is less than four.<br>
if (MF.size() > 3 && EnableTailMerge) {<br>
- unsigned TailMergeSize = TailDupPlacementThreshold + 1;<br>
+ unsigned TailMergeSize = TailDupSize + 1;<br>
BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,<br>
*MBPI, TailMergeSize);<br>
<br>
<br>
Modified: llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll?rev=303084&r1=303083&r2=303084&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll?rev=303084&r1=303083&r2=303084&view=diff</a><br>
==============================================================================<br>
--- llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll (original)<br>
+++ llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll Mon May 15 12:30:47 2017<br>
@@ -1,4 +1,5 @@<br>
-; RUN: llc -O2 < %s | FileCheck %s<br>
+; RUN: llc -O2 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O2 %s<br>
+; RUN: llc -O3 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O3 %s<br>
target datalayout = "e-m:e-i64:64-n32:64"<br>
target triple = "powerpc64le-grtev4-linux-gnu"<br>
<br>
@@ -99,11 +100,9 @@ exit:<br>
; test1<br>
; test2<br>
; test3<br>
-; test4<br>
; optional1<br>
; optional2<br>
; optional3<br>
-; optional4<br>
; exit<br>
; even for 50/50 branches.<br>
; Tail duplication puts test n+1 at the end of optional n<br>
@@ -157,6 +156,98 @@ test3:<br>
br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1<br>
optional3:<br>
call void @c()<br>
+ br label %exit<br>
+exit:<br>
+ ret void<br>
+}<br>
+<br>
+; Intended layout:<br>
+; The chain-of-triangles based duplication produces the following layout when<br>
+; 3 instructions are allowed for tail-duplication:<br>
+; test1<br>
+; test2<br>
+; test3<br>
+; optional1<br>
+; optional2<br>
+; optional3<br>
+; exit<br>
+;<br>
+; Otherwise it produces the layout:<br>
+; test1<br>
+; optional1<br>
+; test2<br>
+; optional2<br>
+; test3<br>
+; optional3<br>
+; exit<br>
+<br>
+;CHECK-LABEL: straight_test_3_instr_test:<br>
+; test1 may have been merged with entry<br>
+;CHECK: mr [[TAGREG:[0-9]+]], 3<br>
+;CHECK: clrlwi {{[0-9]+}}, [[TAGREG]], 30<br>
+;CHECK-NEXT: cmplwi {{[0-9]+}}, 2<br>
+<br>
+;CHECK-O3-NEXT: bne 0, .[[OPT1LABEL:[_0-9A-Za-z]+]]<br>
+;CHECK-O3-NEXT: # %test2<br>
+;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29<br>
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8<br>
+;CHECK-O3-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]<br>
+;CHECK-O3-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3<br>
+;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27<br>
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32<br>
+;CHECK-O3-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]<br>
+;CHECK-O3-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit<br>
+;CHECK-O3: blr<br>
+;CHECK-O3-NEXT: .[[OPT1LABEL]]:<br>
+;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29<br>
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8<br>
+;CHECK-O3-NEXT: beq 0, .[[TEST3LABEL]]<br>
+;CHECK-O3-NEXT: .[[OPT2LABEL]]:<br>
+;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27<br>
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32<br>
+;CHECK-O3-NEXT: beq 0, .[[EXITLABEL]]<br>
+;CHECK-O3-NEXT: .[[OPT3LABEL]]:<br>
+;CHECK-O3: b .[[EXITLABEL]]<br>
+<br>
+;CHECK-O2-NEXT: beq 0, .[[TEST2LABEL:[_0-9A-Za-z]+]]<br>
+;CHECK-O2-NEXT: # %optional1<br>
+;CHECK-O2: .[[TEST2LABEL]]: # %test2<br>
+;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29<br>
+;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 8<br>
+;CHECK-O2-NEXT: beq 0, .[[TEST3LABEL:[_0-9A-Za-z]+]]<br>
+;CHECK-O2-NEXT: # %optional2<br>
+;CHECK-O2: .[[TEST3LABEL]]: # %test3<br>
+;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27<br>
+;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 32<br>
+;CHECK-O2-NEXT: beq 0, .[[EXITLABEL:[_0-9A-Za-z]+]]<br>
+;CHECK-O2-NEXT: # %optional3<br>
+;CHECK-O2: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit<br>
+;CHECK-O2: blr<br>
+<br>
+<br>
+define void @straight_test_3_instr_test(i32 %tag) {<br>
+entry:<br>
+ br label %test1<br>
+test1:<br>
+ %tagbit1 = and i32 %tag, 3<br>
+ %tagbit1eq0 = icmp eq i32 %tagbit1, 2<br>
+ br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2<br>
+optional1:<br>
+ call void @a()<br>
+ br label %test2<br>
+test2:<br>
+ %tagbit2 = and i32 %tag, 12<br>
+ %tagbit2eq0 = icmp eq i32 %tagbit2, 8<br>
+ br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2<br>
+optional2:<br>
+ call void @b()<br>
+ br label %test3<br>
+test3:<br>
+ %tagbit3 = and i32 %tag, 48<br>
+ %tagbit3eq0 = icmp eq i32 %tagbit3, 32<br>
+ br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1<br>
+optional3:<br>
+ call void @c()<br>
br label %exit<br>
exit:<br>
ret void<br>
<br>
Modified: llvm/trunk/test/CodeGen/X86/sse1.ll<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse1.ll?rev=303084&r1=303083&r2=303084&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse1.ll?rev=303084&r1=303083&r2=303084&view=diff</a><br>
==============================================================================<br>
--- llvm/trunk/test/CodeGen/X86/sse1.ll (original)<br>
+++ llvm/trunk/test/CodeGen/X86/sse1.ll Mon May 15 12:30:47 2017<br>
@@ -66,7 +66,10 @@ define <4 x float> @vselect(<4 x float>*<br>
; X32-NEXT: jne .LBB1_8<br>
; X32-NEXT: .LBB1_7:<br>
; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero<br>
-; X32-NEXT: jmp .LBB1_9<br>
+; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]<br>
+; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)<br>
+; X32-NEXT: je .LBB1_10<br>
+; X32-NEXT: jmp .LBB1_11<br>
; X32-NEXT: .LBB1_1:<br>
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero<br>
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)<br>
@@ -77,11 +80,10 @@ define <4 x float> @vselect(<4 x float>*<br>
; X32-NEXT: je .LBB1_7<br>
; X32-NEXT: .LBB1_8: # %entry<br>
; X32-NEXT: xorps %xmm3, %xmm3<br>
-; X32-NEXT: .LBB1_9: # %entry<br>
; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]<br>
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)<br>
; X32-NEXT: jne .LBB1_11<br>
-; X32-NEXT: # BB#10:<br>
+; X32-NEXT: .LBB1_10:<br>
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero<br>
; X32-NEXT: .LBB1_11: # %entry<br>
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]<br>
@@ -103,7 +105,10 @@ define <4 x float> @vselect(<4 x float>*<br>
; X64-NEXT: jne .LBB1_8<br>
; X64-NEXT: .LBB1_7:<br>
; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero<br>
-; X64-NEXT: jmp .LBB1_9<br>
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]<br>
+; X64-NEXT: testl %esi, %esi<br>
+; X64-NEXT: je .LBB1_10<br>
+; X64-NEXT: jmp .LBB1_11<br>
; X64-NEXT: .LBB1_1:<br>
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero<br>
; X64-NEXT: testl %edx, %edx<br>
@@ -114,11 +119,10 @@ define <4 x float> @vselect(<4 x float>*<br>
; X64-NEXT: je .LBB1_7<br>
; X64-NEXT: .LBB1_8: # %entry<br>
; X64-NEXT: xorps %xmm3, %xmm3<br>
-; X64-NEXT: .LBB1_9: # %entry<br>
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]<br>
; X64-NEXT: testl %esi, %esi<br>
; X64-NEXT: jne .LBB1_11<br>
-; X64-NEXT: # BB#10:<br>
+; X64-NEXT: .LBB1_10:<br>
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero<br>
; X64-NEXT: .LBB1_11: # %entry<br>
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]<br>
<br>
<br>
_______________________________________________<br>
llvm-commits mailing list<br>
<a href="mailto:llvm-commits@lists.llvm.org" target="_blank">llvm-commits@lists.llvm.org</a><br>
<a href="http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits" rel="noreferrer" target="_blank">http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits</a><br>
</blockquote></div>