<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Mon, Jan 27, 2014 at 5:11 AM, Chandler Carruth <span dir="ltr">&lt;<a href="mailto:chandlerc@gmail.com" target="_blank">chandlerc@gmail.com</a>&gt;</span> wrote:<br>

<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Author: chandlerc<br>

Date: Mon Jan 27 07:11:50 2014<br>

New Revision: 200219<br>

<br>

URL: <a href="http://llvm.org/viewvc/llvm-project?rev=200219&view=rev" target="_blank">http://llvm.org/viewvc/llvm-project?rev=200219&view=rev</a><br>

Log:<br>

[vectorize] Initial version of respecting PGO in the vectorizer: treat<br>

cold loops as-if they were being optimized for size.<br>

<br>

Nothing fancy here. Simple test case included. The nice thing is that we<br>

can now incrementally build on top of this to drive other heuristics.<br>

All of the infrastructure work is done to get the profile information<br>

into this layer.<br>

<br>

The remaining work necessary to make this a fully general purpose loop<br>

unroller for very hot loops is to make it a fully general purpose loop<br>

unroller. Things I know of but am not going to have time to benchmark<br>

and fix in the immediate future:<br>

<br>

1) Don't disable the entire pass when the target is lacking vector<br>

   registers. This really doesn't make any sense any more.<br>

2) Teach the unroller at least and the vectorizer potentially to handle<br>

   non-if-converted loops. This is trivial for the unroller but hard for<br>

   the vectorizer.<br>

3) Compute the relative hotness of the loop and thread that down to the<br>

   various places that make cost tradeoffs (very likely only the<br>

   unroller makes sense here, and then only when dealing with loops that<br>

   are small enough for unrolling to not completely blow out the LSD).<br>

<br>

I'm still dubious how useful hotness information will be. So far, my<br>

experiments show that if we can get the correct logic for determining<br>

when unrolling actually helps performance, the code size impact is<br>

completely unimportant and we can unroll in all cases. But at least<br>

we'll no longer burn code size on cold code.<br>

<br>

One somewhat unrelated idea that I've had forever but not had time to<br>

implement: mark all functions which are only reachable via the global<br>

constructors rigging in the module as optsize.</blockquote><div><br></div><div>Just idle curiosity - but wouldn't that be likely to hurt startup time which seems important to some people/programs?</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

 This would also decrease<br>

the impact of any more aggressive heuristics here on code size.<br>

<br>

Modified:<br>

    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp<br>

    llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll<br>

<br>

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp<br>

URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=200219&r1=200218&r2=200219&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=200219&r1=200218&r2=200219&view=diff</a><br>


==============================================================================<br>

--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)<br>

+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Mon Jan 27 07:11:50 2014<br>

@@ -56,6 +56,7 @@<br>

 #include "llvm/ADT/SmallVector.h"<br>

 #include "llvm/ADT/StringExtras.h"<br>

 #include "llvm/Analysis/AliasAnalysis.h"<br>

+#include "llvm/Analysis/BlockFrequencyInfo.h"<br>

 #include "llvm/Analysis/LoopInfo.h"<br>

 #include "llvm/Analysis/LoopIterator.h"<br>

 #include "llvm/Analysis/LoopPass.h"<br>

@@ -78,6 +79,7 @@<br>

 #include "llvm/IR/Value.h"<br>

 #include "llvm/IR/Verifier.h"<br>

 #include "llvm/Pass.h"<br>

+#include "llvm/Support/BranchProbability.h"<br>

 #include "llvm/Support/CommandLine.h"<br>

 #include "llvm/Support/Debug.h"<br>

 #include "llvm/Support/PatternMatch.h"<br>

@@ -980,18 +982,27 @@ struct LoopVectorize : public FunctionPa<br>

   LoopInfo *LI;<br>

   TargetTransformInfo *TTI;<br>

   DominatorTree *DT;<br>

+  BlockFrequencyInfo *BFI;<br>

   TargetLibraryInfo *TLI;<br>

   bool DisableUnrolling;<br>

   bool AlwaysVectorize;<br>

<br>

+  BlockFrequency ColdEntryFreq;<br>

+<br>

   virtual bool runOnFunction(Function &F) {<br>

     SE = &amp;getAnalysis&lt;ScalarEvolution&gt;();<br>

     DL = getAnalysisIfAvailable&lt;DataLayout&gt;();<br>

     LI = &amp;getAnalysis&lt;LoopInfo&gt;();<br>

     TTI = &amp;getAnalysis&lt;TargetTransformInfo&gt;();<br>

     DT = &amp;getAnalysis&lt;DominatorTreeWrapperPass&gt;().getDomTree();<br>

+    BFI = &amp;getAnalysis&lt;BlockFrequencyInfo&gt;();<br>

     TLI = getAnalysisIfAvailable&lt;TargetLibraryInfo&gt;();<br>

<br>

+    // Compute some weights outside of the loop over the loops. Compute this<br>

+    // using a BranchProbability to re-use its scaling math.<br>

+    const BranchProbability ColdProb(1, 5); // 20%<br>

+    ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;<br>

+<br>

     // If the target claims to have no vector registers don't attempt<br>

     // vectorization.<br>

     if (!TTI->getNumberOfRegisters(true))<br>

@@ -1064,6 +1075,13 @@ struct LoopVectorize : public FunctionPa<br>

     bool OptForSize =<br>

         Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize);<br>

<br>

+    // Compute the weighted frequency of this loop being executed and see if it<br>

+    // is less than 20% of the function entry baseline frequency. Note that we<br>

+    // always have a canonical loop here because we think we *can* vectorize.<br>

+    BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());<br>

+    if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)<br>

+      OptForSize = true;<br>

+<br>

     // Check the function attributes to see if implicit floats are allowed.<br>

     // FIXME: This check doesn't seem possibly correct -- what if the loop is<br>

     // an integer loop and the vector instructions selected are purely integer<br>

@@ -1109,6 +1127,7 @@ struct LoopVectorize : public FunctionPa<br>

   virtual void getAnalysisUsage(AnalysisUsage &AU) const {<br>

     AU.addRequiredID(LoopSimplifyID);<br>

     AU.addRequiredID(LCSSAID);<br>

+    AU.addRequired&lt;BlockFrequencyInfo&gt;();<br>

     AU.addRequired&lt;DominatorTreeWrapperPass&gt;();<br>

     AU.addRequired&lt;LoopInfo&gt;();<br>

     AU.addRequired&lt;ScalarEvolution&gt;();<br>

@@ -5469,6 +5488,7 @@ char LoopVectorize::ID = 0;<br>

 static const char lv_name[] = "Loop Vectorization";<br>

 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)<br>

 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)<br>

+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)<br>

 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)<br>

 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)<br>

 INITIALIZE_PASS_DEPENDENCY(LCSSA)<br>

<br>

Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll<br>

URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll?rev=200219&r1=200218&r2=200219&view=diff" target="_blank">http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll?rev=200219&r1=200218&r2=200219&view=diff</a><br>


==============================================================================<br>

--- llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll (original)<br>

+++ llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll Mon Jan 27 07:11:50 2014<br>

@@ -115,6 +115,31 @@ define void @example3(i32 %n, i32* noali<br>

   ret void<br>

 }<br>

<br>

+; N is unknown, we need a tail. Can't vectorize because the loop is cold.<br>

+;CHECK-LABEL: @example4(<br>

+;CHECK-NOT: <4 x i32><br>

+;CHECK: ret void<br>

+define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) {<br>

+  %1 = icmp eq i32 %n, 0<br>

+  br i1 %1, label %._crit_edge, label %.lr.ph, !prof !0<br>

+<br>

+.lr.ph:                                           ; preds = %0, %.lr.ph<br>

+  %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]<br>

+  %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]<br>

+  %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]<br>

+  %2 = add nsw i32 %.05, -1<br>

+  %3 = getelementptr inbounds i32* %.023, i64 1<br>

+  %4 = load i32* %.023, align 16<br>

+  %5 = getelementptr inbounds i32* %.014, i64 1<br>

+  store i32 %4, i32* %.014, align 16<br>

+  %6 = icmp eq i32 %2, 0<br>

+  br i1 %6, label %._crit_edge, label %.lr.ph<br>

+<br>

+._crit_edge:                                      ; preds = %.lr.ph, %0<br>

+  ret void<br>

+}<br>

+<br>

+!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}<br>

<br>

 ; We can't vectorize this one because we need a runtime ptr check.<br>

 ;CHECK-LABEL: @example23(<br>

<br>

<br>

_______________________________________________<br>

llvm-commits mailing list<br>

<a href="mailto:llvm-commits@cs.uiuc.edu">llvm-commits@cs.uiuc.edu</a><br>

<a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits" target="_blank">http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits</a><br>

</blockquote></div><br></div></div>