<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">
<meta name="Generator" content="Microsoft Word 15 (filtered medium)">
<style><!--
/* Font Definitions */
@font-face
{font-family:Wingdings;
panose-1:5 0 0 0 0 0 0 0 0 0;}
@font-face
{font-family:"Cambria Math";
panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
{font-family:DengXian;
panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
{font-family:"\@DengXian";
panose-1:2 1 6 0 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0in;
font-size:11.0pt;
font-family:"Calibri",sans-serif;}
p.MsoListParagraph, li.MsoListParagraph, div.MsoListParagraph
{mso-style-priority:34;
margin-top:0in;
margin-right:0in;
margin-bottom:0in;
margin-left:.5in;
font-size:11.0pt;
font-family:"Calibri",sans-serif;}
span.EmailStyle20
{mso-style-type:personal-reply;
font-family:"Calibri",sans-serif;
color:windowtext;}
.MsoChpDefault
{mso-style-type:export-only;
font-size:10.0pt;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
/* List Definitions */
@list l0
{mso-list-id:547684869;
mso-list-template-ids:1600447082;}
@list l0:level1
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l0:level2
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:1.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l0:level3
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:1.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l0:level4
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:2.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l0:level5
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:2.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l0:level6
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:3.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l0:level7
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:3.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l0:level8
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:4.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l0:level9
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:4.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1
{mso-list-id:715668360;
mso-list-template-ids:26913070;}
@list l1:level1
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level2
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:1.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level3
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:1.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level4
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:2.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level5
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:2.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level6
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:3.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level7
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:3.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level8
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:4.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level9
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:4.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2
{mso-list-id:1053846708;
mso-list-template-ids:470329506;}
@list l2:level1
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2:level2
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:1.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2:level3
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:1.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2:level4
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:2.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2:level5
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:2.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2:level6
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:3.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2:level7
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:3.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2:level8
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:4.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2:level9
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:4.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l3
{mso-list-id:1879584737;
mso-list-type:hybrid;
mso-list-template-ids:-321641034 159967226 67698691 67698693 67698689 67698691 67698693 67698689 67698691 67698693;}
@list l3:level1
{mso-level-start-at:0;
mso-level-number-format:bullet;
mso-level-text:-;
mso-level-tab-stop:none;
mso-level-number-position:left;
text-indent:-.25in;
font-family:"Calibri",sans-serif;
mso-fareast-font-family:DengXian;}
@list l3:level2
{mso-level-number-format:bullet;
mso-level-text:o;
mso-level-tab-stop:none;
mso-level-number-position:left;
text-indent:-.25in;
font-family:"Courier New";}
@list l3:level3
{mso-level-number-format:bullet;
mso-level-text:\F0A7 ;
mso-level-tab-stop:none;
mso-level-number-position:left;
text-indent:-.25in;
font-family:Wingdings;}
@list l3:level4
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:none;
mso-level-number-position:left;
text-indent:-.25in;
font-family:Symbol;}
@list l3:level5
{mso-level-number-format:bullet;
mso-level-text:o;
mso-level-tab-stop:none;
mso-level-number-position:left;
text-indent:-.25in;
font-family:"Courier New";}
@list l3:level6
{mso-level-number-format:bullet;
mso-level-text:\F0A7 ;
mso-level-tab-stop:none;
mso-level-number-position:left;
text-indent:-.25in;
font-family:Wingdings;}
@list l3:level7
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:none;
mso-level-number-position:left;
text-indent:-.25in;
font-family:Symbol;}
@list l3:level8
{mso-level-number-format:bullet;
mso-level-text:o;
mso-level-tab-stop:none;
mso-level-number-position:left;
text-indent:-.25in;
font-family:"Courier New";}
@list l3:level9
{mso-level-number-format:bullet;
mso-level-text:\F0A7 ;
mso-level-tab-stop:none;
mso-level-number-position:left;
text-indent:-.25in;
font-family:Wingdings;}
@list l4
{mso-list-id:2037269312;
mso-list-template-ids:-2115973262;}
@list l4:level1
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level2
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:1.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level3
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:1.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level4
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:2.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level5
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:2.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level6
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:3.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level7
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:3.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level8
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:4.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level9
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:4.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l5
{mso-list-id:2055427459;
mso-list-template-ids:2025908728;}
@list l5:level1
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l5:level2
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:1.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l5:level3
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:1.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l5:level4
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:2.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l5:level5
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:2.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l5:level6
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:3.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l5:level7
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:3.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l5:level8
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:4.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l5:level9
{mso-level-number-format:bullet;
mso-level-text:\F0B7 ;
mso-level-tab-stop:4.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
ol
{margin-bottom:0in;}
ul
{margin-bottom:0in;}
--></style>
</head>
<body lang="EN-US" link="#0563C1" vlink="#954F72" style="word-wrap:break-word">
<div class="WordSection1">
<p class="MsoNormal">Hi Rong,<o:p></o:p></p>
<p class="MsoNormal"> <o:p></o:p></p>
<p class="MsoNormal">Nice to see this proposal which is quite interesting and useful. Actually I believe it can help address some of the performance-critical issues we have encountered with our AutoFDO workloads. As mentioned previously by Wenlei, we have
been dealing with profile counts degradation caused by various transformations, such as loop rotate, simplifyCFG, etc. Relying on BPI/BFI to infer a reasonable counts distribution for duplicated code is sometimes challenging. A per-pass profile will make it
easier. <o:p></o:p></p>
<p class="MsoNormal"> <o:p></o:p></p>
<p class="MsoNormal">One thing I don't quite get is how a flow-sensitive profile is applied to a specific optimization pass. If I understand correctly, the records of duplicated instructions collected for an optimization pass from the training build are mapped
to the corresponding instructions produced by the same pass in the optimized build. Can you explain a bit more about how the mapping works and how code duplication is tracked? Could a slightly different IR duplication cause incorrect mapping?<o:p></o:p></p>
<p class="MsoNormal"> <o:p></o:p></p>
<p class="MsoNormal">It looks like the ID of a pass is considered when adding discriminators for that pass. I guess we are not aiming at supporting every pass with a FS profile, given the size limitation of a 32-bit DWARF discriminator. What are the first-class
passes being supported? Would you consider extending the discriminator size for new optimizations?<o:p></o:p></p>
<p class="MsoNormal"> <o:p></o:p></p>
<p class="MsoNormal">Thanks,<o:p></o:p></p>
<p class="MsoNormal">Hongtao<o:p></o:p></p>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><o:p> </o:p></p>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in">
<p class="MsoNormal" style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:12.0pt;margin-left:.5in">
<b><span style="font-size:12.0pt;color:black">From: </span></b><span style="font-size:12.0pt;color:black">Wenlei He &lt;wenlei@fb.com&gt;<br>
<b>Date: </b>Wednesday, November 18, 2020 at 4:34 PM<br>
<b>To: </b>Rong Xu &lt;xur@google.com&gt;, llvm-dev &lt;llvm-dev@lists.llvm.org&gt;<br>
<b>Cc: </b>David Li &lt;davidxl@google.com&gt;, Hongtao Yu &lt;hoy@fb.com&gt;<br>
<b>Subject: </b>Re: [llvm-dev] [RFC] Control Flow Sensitive AutoFDO (FS-AFDO)<o:p></o:p></span></p>
</div>
<p class="MsoNormal" style="margin-left:.5in">Hi Rong,<o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in">This is a very interesting proposal. We've also observed profile quality degradation from CFG-destructive passes like loop rotate, and I can see how this proposal would help improve the quality of the profile that drives
later optimization passes in the pipeline. I have a few questions.<o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p class="MsoListParagraph" style="margin-left:1.0in;text-indent:-.25in;mso-list:l3 level1 lfo3">
<![if !supportLists]><span style="mso-list:Ignore">-<span style="font:7.0pt "Times New Roman"">
</span></span><![endif]>How does this affect today's AutoFDO? Specifically, can users upgrade compiler with FS-AutoFDO support but without refreshing their profile? I think it's important to make new improvement like this as opt-in, so other users of AutoFDO
can choose if and when they want to make the switch from AutoFDO to FS-AutoFDO. With the proposed changes to discriminator encoding, sounds like we are going to eliminate duplication factor etc. altogether. In that case, multiple FS-AutoFDO sample profile
loading would be required in order to not regress from today's AutoFDO due to lack of duplication factor. Is that correct? If so, this change as is can break backward compatibility with today's AutoFDO profile. It’d be great to handle discriminator in a way
that is compatible with today’s AutoFDO. <o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p class="MsoListParagraph" style="margin-left:1.0in;text-indent:-.25in;mso-list:l3 level1 lfo3">
<![if !supportLists]><span style="mso-list:Ignore">-<span style="font:7.0pt "Times New Roman"">
</span></span><![endif]>Profile loading usually happens early in the pipeline in order to avoid mismatch between profiling and optimization. In the case of FS-AutoFDO, some mismatch may be inevitable for the later FS profile loading. In practice, how significant
is the mismatch in later profile loading? Have you tried to see by how much using multiple iterations (like CSPGO) can further help profile quality and improve performance? I understand this is not practical for production use, but could give us data points
as to what's left on the table due to mismatch of later FS profile loading.<o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p class="MsoListParagraph" style="margin-left:1.0in;text-indent:-.25in;mso-list:l3 level1 lfo3">
<![if !supportLists]><span style="mso-list:Ignore">-<span style="font:7.0pt "Times New Roman"">
</span></span><![endif]>In your performance experiment, where is the extra FSProfileSampleLoader in the pipeline, right before machine block placement or somewhere else? Have you tried adding more than one FSProfileSampleLoader and is there extra perf gain
for more than one FSProfileSampleLoader?<o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p class="MsoListParagraph" style="margin-left:1.0in;text-indent:-.25in;mso-list:l3 level1 lfo3">
<![if !supportLists]><span style="mso-list:Ignore">-<span style="font:7.0pt "Times New Roman"">
</span></span><![endif]>For the final discriminator assignment, is this because we take MAX of samples on addresses from the same location? So if there's any late code duplication after last discriminator assignment, we need a final discriminator assignment
to turn MAX into SUM for earlier FS profile?<o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p class="MsoListParagraph" style="margin-left:1.0in;text-indent:-.25in;mso-list:l3 level1 lfo3">
<![if !supportLists]><span style="mso-list:Ignore">-<span style="font:7.0pt "Times New Roman"">
</span></span><![endif]>While changing discriminator encoding, have you considered encoding the block ID into the discriminator so AutoFDO can be CFG-aware? This is something Wei brought up earlier in the context of CSSPGO, and we didn't go that route partly because
it's hard to do it without breaking compatibility.<o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in">This would be complementary to the context-sensitive SPGO we proposed earlier - would be nice to make PGO/FDO both context-sensitive and flow-sensitive. I think the flow-sensitive part could also integrate with
pseudo-probe, essentially we can append FS discriminator later multiple times the same way as you proposed here, except that the "line" part would be "probeId".<o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in">+Hongtao as well.<o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in">Thanks,<o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in">Wenlei<o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in">
<p class="MsoNormal" style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:12.0pt;margin-left:.5in">
<b><span style="font-size:12.0pt;color:black">From: </span></b><span style="font-size:12.0pt;color:black">llvm-dev &lt;llvm-dev-bounces@lists.llvm.org&gt;<br>
<b>Date: </b>Tuesday, November 17, 2020 at 9:55 AM<br>
<b>To: </b>llvm-dev &lt;llvm-dev@lists.llvm.org&gt;<br>
<b>Cc: </b>David Li &lt;davidxl@google.com&gt;<br>
<b>Subject: </b>[llvm-dev] [RFC] Control Flow Sensitive AutoFDO (FS-AFDO)</span><o:p></o:p></p>
</div>
<div>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Hi all,</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Here I include an RFC for control flow sensitive AutoFDO (FS-AFDO). This is a joint work with David Li. Questions and feedback are welcome.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Thanks,</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Rong</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">=============</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">[RFC] Control Flow Sensitive AutoFDO (FS-AFDO)</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">1. Motivation</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">AFDO profile is derived from PMU samples from running an earlier build binary. PMU samples are indexed by the IP addresses. An offline tool uses the debug line number to map PMU samples to the source
line. The accuracy of debug information is critical for AFDO performance. Thanks to the continuous efforts from the community, we have fixed many debug info related issues that affect AFDO.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">One of the remaining areas that affects the profile accuracy is the handling of duplicated code. Aggressive compiler optimizations can change functions’ control flow significantly. The transformation
can duplicate branches in multiple places and new branches that do not exist in the original source code might also be introduced. All these branches, even though originated from the same source line, may have very different behavior -- which is called (control)
flow sensitivity.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">Current AFDO handles the duplication in a pretty coarse-grained way. When AFDO sees samples with different IP addresses that map to the same source line, it uses a max function to get the maximum number
of samples and attaches it to that line. From this perspective, current AFDO is not control flow sensitive. Consider the following example:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black"> </span><span style="font-family:"Courier New";color:black">// Original code:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> for()</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> Stmt1; // line 10</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> // After a versioning optimization</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> for() {</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> if (cond1)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> stmt1; // line 10, 100 samples, address: C1</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> else if (cond2)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> stmt1; // line 10, 80 samples, address: C2</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> else</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> stmt1; // line 10, 20 samples, address: C3</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> }</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">In the final binary, stmt1 is cloned 3 times by the versioning optimization. The address C1 gets 100 samples, the address C2 gets 80 samples, and the address C3 gets 20 samples. When converting Perf
samples to AFDO profile, a max function is used for these samples as they have the same source line. In the AFDO profile, we will have a sample entry of &lt;10, 100&gt;, which means we have a sample value of 100 for line 10 of this function. We will not have branch
information for cond1, cond2, as they do not exist in the sample loading phase. The profile count for stmt1 is also off the actual value (200 = 100 + 80 + 20) by 50%.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">In this work, we propose a method that maintains more precise AFDO profile counts for the duplicated control flow.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:12.0pt;margin-left:.5in">
<o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">2. FS-AFDO Designs</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">In this work, we don't aim to keep track of the transformations and make the flow sensitivity available across the compilation pipelines. Rather, we identify a set of target optimizations that can potentially
benefit most from a more precise profile. For each of such optimizations, we use a helper pass to recompute the flow sensitive profile just before this pass. When loading this profile, we will recompute the branch probability and hopefully improve the optimization
quality for that target pass.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">The flow sensitivity in the profile is through a hierarchical discriminator scheme that can better preserve flow information in the Perf profile.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">2.1. Current Use of Discriminators</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">The AFDO profile contains a discriminator field whose goal is to differentiate statements with the same line number in the source code (largely from MACRO expansions).</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Discriminators were later expanded to get more accurate profile counts. It is expanded to 3 components which can partially address the issue. The main objective of that work was to get the correct profile
count for the unrolled (or vectorized) BBs. The discriminator is divided into 3 components:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black"> * Base discriminator, assigned by AddDiscriminators</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black"> * Duplication factor: used in loop unroll and loop vectorization (when cloning the loop body). This factor will be multiplied by sample counts in create_llvm_prof tool to get the count value before duplication.</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">* Copy Identifier: reserved but not currently used.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">Duplication factor was explicitly assigned in loop unroll and loop vectorization. Instruction cloning will copy the discriminator. All the cloned instruction instances, other than those from inlining, loop unrolling,
and vectorization, have the same discriminator. In the create_llvm_prof tool, when assigning the sample count value for &lt;offset_to_func_head, discriminator&gt;, the max sample value among all the clones will be the assigned value.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">2.2 Hierarchical discriminators</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">In FS-AFDO, discriminator assigning happens multiple times -- conceptually, one assignment for each optimization that can benefit from more precise FS profile data (let’s call it a target optimization
pass). The discriminator is divided into multiple sections. Earlier passes use the lower bits, and each round of assignment accesses only the bits in its own section.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">For example, there could be 3 rounds of discriminator assigning: the base discriminator which corresponds to existing discriminator assigning, discriminator for pass N, and discriminator for pass M.
Let’s assume each round uses 4 bits. If the instruction Inst1 is from BB0 and it’s copied to BB1, it will have a discriminator 0x1. If BB1:Inst1 is copied to BB2:intr2 before Pass N, this instruction will have a new discriminator of 0x11 associated with pass
N. Similarly, each copy of Inst1 before Pass M will have its own discriminators using bit 8 to bit 11.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">When converting Perf samples to the AFDO profile, we will have the sample count for each discriminator. The AFDO profile loader reads all the samples and masks out the bits used by later rounds. For example,
when loading the profile for the base discriminator, it only checks the first 4 bits. All the samples cloned from the same base discriminator will be aggregated together. Profile loader for pass N will only check the first 8 bits of the discriminator and sum
up all the samples with the same discriminator bit 0 to bit 7.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">2.3 Profile Mismatch</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">The above describes the ideal situation for loading the profile -- all the discriminators in the Perf binary match the discriminators in the optimized build. In reality, the compilation for the optimized binary,
with a new profile, might perform different transformations (from the compilation of the Perf binary). It can alter the program control flow for the downstream passes, which leads to a discriminator mismatch.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">If the mismatch happens, the profiling information will not be updated by the SampleLoader to the affected BBs. They will inherit the profile information (like branch probabilities) from the previous
round of SampleLoader.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">The FS component in the discriminators basically uses a sequential number, just like the base discriminator. But to handle a potential mismatch, we also encode the total number of new discriminators.
This prevents profile counts from being attributed to the wrong code clone. Because each FS discriminator component has only a limited number of bits, we use a hash value as the FS discriminator. The hash function takes the following as input:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black"> * the sequential number of the current clone,</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black"> * the total number of clones.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">A second measure to deal with mismatch is to use the profile coverage. We sum the samples used by SampleLoader for each function and compare the result with the total samples in the profile. A warning is printed
out if the coverage (the ratio of used samples to total samples) is lower than a threshold value. In that case, the user should rebuild the Perf binary and collect the Perf profile again.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">2.4 Example of pass pipeline</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">Here is a sample pass pipeline for AFDO + ThinLTO compilation.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">// Pass Pipeline:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Courier New&quot;;color:black">InsertBaseDiscriminator</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">SampleProfileLoader in FE</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">...</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">CM inline</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">...</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">SampleProfileLoader in BE</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">...</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">AddFSDiscriminator(kind_mbp)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">FSProfileLoader(kind_mbp)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">MachineBlockPlacement</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">AddFSDiscriminator(kind_branchfolding)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">FSProfileLoader(kind_branchfolding)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">BranchFoldingPass</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">…</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">AddFSDiscriminator(kind_final)</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">In the above pass pipeline, MachineBlockPlacement and BranchFoldingPass are the two target optimization passes. MachineBlockPlacement can obviously benefit from more precise profile information.
The tail merge in BranchFoldingPass can undo many unneeded tail duplications in MachineBlockPlacement. </span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">Note that we have an FS discriminator assigning pass at the end of the compilation pipeline. The reason is to assign a different discriminator for each cloned instruction in the final binary. By doing this,
we can get a more precise profile for earlier passes by summing the counter values from different discriminators.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:12.0pt;margin-left:.5in">
<o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">3. A running example</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">The test program is shown below. We pay most attention to the branches at line 5 and line 7 in function foo(). The inner loop at line 3 will be fully unrolled after SampleProfileLoader. The outer loop
at line 2 is left intact by the optimizer.</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">__attribute__((noinline)) int bar(int i){</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> volatile int j;</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> j = i;</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> return j;</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">}</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">__attribute__((noinline)) void work(int i){</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">...</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">}</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">__attribute__((noinline)) void foo(){ // line 17 of unroll.c</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> int i, j;</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> for (j = 0; j < 48; j++)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> for (i = 0; i < 4; i++) {</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> int ii = bar(i + j * 48);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> if (ii % 2) // line 22 of unroll.c</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> work(ii * 2);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> if (ii % 4) // line 24 of unroll.c</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> work(ii * 3);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> }</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">}</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Current AFDO will produce the following profile:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">foo:30387110:1606 void foo() {</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 0: 1606 int i, j;</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 2.1: 83748 for (j = 0; j < 48; j++) {</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 4: 351836 bar:84932 int ii = bar(i + j * 48);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 5: 351836 if (ii % 2)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 6: 351836 work:79741 work(ii * 2)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 7: 350420 if (ii % 4)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 8: 338340 work:75287 work(ii * 3)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 10: 1302 }</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Function foo() entry sample count is 1606. Line 2.1 is the outer loop body with a sample count of 83748. Line 4 to line 8 is inner loop body count. It’s scaled by an unrolled multiplication discriminator
of 4. This resulted in a higher value of sample count: it uses the Max of 4 clones and multiplies by 4.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">Here is the FS AFDO profile. Basically, it has a separate profile count for each unrolled instruction.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">foo:8722583:1597 void foo() {</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 0: 1597 int i, j;</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 2.1: 83694 for (j = 0; j < 48; j++)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 4: 87205 bar:85542 int ii = bar(i + j * 48);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 4.8448: 82042 bar:82155 int ii = bar(i + j * 48);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 4.13312: 84795 bar:84802 int ii = bar(i + j * 48);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 4.13568: 87820 bar:87718 int ii = bar(i + j * 48);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 5: 87205 if (ii % 2)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 5.8448: 82042 if (ii % 2)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 5.13312: 79604 if (ii %2)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 5.13568: 87820 if (ii %2)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 6: 0 work(ii * 2);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 6.8448: 0 work(ii * 2);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 6.13312: 79604 work:79652 work(ii * 2);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 6.13568: 87820 work:90500 work(ii * 2);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 7: 87493 if (ii %4)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 7.8448: 84624 if (ii % 4)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 7.13312: 71942 if (ii % 4)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 7.13568: 83552 if (ii % 4)</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 8: 0 work(ii * 3);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 8.8448: 84624 work:87655 work(ii * 3);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 8.13312: 71942 work:75137 work(ii *3);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 8.13568: 83552 work:91223 work(ii *3);</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black"> 10: 1304 }</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">Let’s look at line 6 and line 8, each of which has 4 records. When we use these records in SampleLoader, these 4 records will be combined into one after masking out the bits: line 6 will have one record
of</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">6: 167424 work: 170152</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Line 8 will have one record:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">8: 240118 work: 254015</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Comparing the sample count in line 5 and 7, we can easily see that line 5 has ~50% taken and line 7 has ~75% taken. This is in contrast to the current AFDO profile, where both lines have a 100% taken
branch.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">When we load the above profile before the MachineBlockPlacement pass, each record will maintain its own instance. We can see that at line 5, two of the branches have ~0% taken probability, while the remaining
two have ~100% taken probability. For line 7, one branch has 0% taken probability, while the remaining three have ~100% taken probability. We will reset the branch probabilities as shown in the following messages:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">Set branch fs prob: MBB (1 -> 3): unroll.c:22:11 W=87820 0x00005f85 / 0x80000000 = 0.00% --> 0x80000000 / 0x80000000 = 100.00%</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">Set branch fs prob: MBB (1 -> 2): unroll.c:22:11 W=87820 0x7fffa07b / 0x80000000 = 100.00% --> 0x00000000 / 0x80000000 = 0.00%</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">Set branch fs prob: MBB (3 -> 5): unroll.c:24:11 W=87820 0x04a8dc00 / 0x80000000 = 3.64% --> 0x80000000 / 0x80000000 = 100.00%</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">Set branch fs prob: MBB (3 -> 4): unroll.c:24:11 W=87820 0x7b572400 / 0x80000000 = 96.36% --> 0x00000000 / 0x80000000 = 0.00%</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">Set branch fs prob: MBB (9 -> 11): unroll.c:22:11 W=87820 0x00005f85 / 0x80000000 = 0.00% --> 0x80000000 / 0x80000000 = 100.00%</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">Set branch fs prob: MBB (9 -> 10): unroll.c:22:11 W=87820 0x7fffa07b / 0x80000000 = 100.00% --> 0x00000000 / 0x80000000 = 0.00%</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">Set branch fs prob: MBB (15 -> 17): unroll.c:24:11 W=87820 0x04a8dc00 / 0x80000000 = 3.64% --> 0x17248215 / 0x80000000 = 18.08%</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Courier New";color:black">Set branch fs prob: MBB (15 -> 16): unroll.c:24:11 W=87820 0x7b572400 / 0x80000000 = 96.36% --> 0x68db7deb / 0x80000000 = 81.92%</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:12.0pt;margin-left:.5in">
<o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">4. Implementation in LLVM</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">4.1 AFDO profile data format</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">The AFDO profile format remains largely the same. Samples could be split into multiple parts with different discriminator copy components. SampleLoader will aggregate the samples dynamically when reading
the AFDO profile based on the discriminator masks. Current discriminator encoding will be removed and related code in the compiler will be removed.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">The hierarchical discriminator in FS-AFDO captures most of the cases of the multiplication discriminator in the current encoding scheme. As a matter of fact, the resulting count values are usually more precise
as each individual discriminator may have different count values. There is one exception: when the unrolling body has a single BB. FS-AFDO will not have the correct sample count after unrolling, as all the instructions reside in the same BB. In our prototype
implementation, we did not find this issue to have a visible performance impact.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">4.2 Compiler changes</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">4.2.1 Add two new CodeGen passes:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">AddFSDiscriminatorsPass: adds flow sensitive discriminators.</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">FSProfileLoaderPass: read FS profile and set branch probabilities.</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">FSProfileLoaderPass will reuse most of the IR level SampleProfileLoader algorithm and heuristics, but we need to reimplement it at the Machine level. We will not need to update branch metadata. We will change
the branch probabilities directly.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">4.2.2 Enhance SampleProfileReader to handle FS discriminators.</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Add a new field of MaskedBitFrom to SampleProfileReader. This is the lower mask bit being used for this pass instance. For each profile loader, we dynamically compute samples based on bit masks -- this
is done in readImpl() method where the masked discriminators are used to aggregate the samples.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">4.3 AutoFDO tool changes</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">The position type used in PositionCountMap and Callsite type is of type uint32. It is divided into the high 16 bits as the line offset and the lower 16 bits as the discriminator. This is probably OK for the current discriminators
as they are numbered sequentially. But this would create too many clashes in FS discriminators. We need to extend the position to uint64: a full 32 bits for the line offset and a full 32 bits for the discriminator.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:12.0pt;margin-left:.5in">
<o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">5. Experimental performance</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">A prototype was implemented and a performance test was done on critical applications in Google.</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">Test binary:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">(1) build the first round binary with an existing AFDO profile with AddFSDiscriminator enabled.</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:"Arial",sans-serif;color:black">(2) collect the PERF samples and convert them to the FS AFDO profile.</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:&quot;Arial&quot;,sans-serif;color:black">(3) build an optimized binary using the FS AFDO profile from step (2).</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:'Arial',sans-serif;color:black">Base binary:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:'Arial',sans-serif;color:black">(1) build the first round binary with an existing AFDO profile.</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:'Arial',sans-serif;color:black">(2) collect the PERF samples and convert them to the AFDO profile.</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:'Arial',sans-serif;color:black">(3) build an optimized binary using the profile from step (2).</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:'Arial',sans-serif;color:black">Run time performance:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:'Arial',sans-serif;color:black">The test binary shows a 0.9% to 1.3% performance improvement over the base binary.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:'Arial',sans-serif;color:black">Profile size:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:'Arial',sans-serif;color:black">Profile size is modestly increased. Using the Google benchmark as the example, the profile size for the text form increased by 4% to 7% while the binary form profile size increased by 5% to 8%.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"> <o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:'Arial',sans-serif;color:black">Compiler time:</span><o:p></o:p></p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:0in;margin-left:.5in">
<span style="font-family:'Arial',sans-serif;color:black">Prototype implementation shows about 10% increase of build time (measured by the elapsed time which includes two extra FSAddDiscriminator passes and one extra FSProfileSampleLoader pass). Profile readings
are not shared between FSProfileSampleLoader and current ProfileSampleLoader. We expect a smaller compiler time increase with better data structures that share a profile loader.</span><o:p></o:p></p>
<p class="MsoNormal" style="margin-left:.5in"><br>
<br>
<br>
<br>
<o:p></o:p></p>
</div>
</div>
</body>
</html>