<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Word 15 (filtered medium)">
<style><!--
/* Font Definitions */
@font-face
{font-family:"Cambria Math";
panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
{font-family:DengXian;
panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
{font-family:"Apple Color Emoji";
panose-1:0 0 0 0 0 0 0 0 0 0;}
@font-face
{font-family:"\@DengXian";
panose-1:2 1 6 0 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0in;
margin-bottom:.0001pt;
font-size:11.0pt;
font-family:"Calibri",sans-serif;}
a:link, span.MsoHyperlink
{mso-style-priority:99;
color:blue;
text-decoration:underline;}
span.EmailStyle18
{mso-style-type:personal-reply;
font-family:"Calibri",sans-serif;
color:windowtext;}
.MsoChpDefault
{mso-style-type:export-only;
font-size:10.0pt;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
/* List Definitions */
@list l0
{mso-list-id:178812350;
mso-list-template-ids:-1936661502;}
@list l1
{mso-list-id:782111453;
mso-list-template-ids:-1219882440;}
@list l1:level1
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level2
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:1.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level3
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:1.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level4
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:2.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level5
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:2.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level6
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:3.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level7
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:3.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level8
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:4.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level9
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:4.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2
{mso-list-id:820773335;
mso-list-template-ids:-1945600956;}
@list l3
{mso-list-id:1069230306;
mso-list-template-ids:727207172;}
@list l4
{mso-list-id:1115363757;
mso-list-template-ids:1402401726;}
@list l4:level1
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level2
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:1.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level3
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:1.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level4
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:2.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level5
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:2.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level6
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:3.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level7
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:3.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level8
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:4.0in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l4:level9
{mso-level-number-format:bullet;
mso-level-text:;
mso-level-tab-stop:4.5in;
mso-level-number-position:left;
text-indent:-.25in;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l5
{mso-list-id:1814371011;
mso-list-template-ids:-1064634426;}
ol
{margin-bottom:0in;}
ul
{margin-bottom:0in;}
--></style>
</head>
<body lang="EN-US" link="blue" vlink="purple">
<div class="WordSection1">
<p class="MsoNormal">See my <span style="color:#4472C4">answers inline</span>.<o:p></o:p></p>
<p class="MsoNormal"><o:p> </o:p></p>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in">
<p class="MsoNormal"><b><span style="font-size:12.0pt;color:black">From: </span></b><span style="font-size:12.0pt;color:black">Xinliang David Li &lt;davidxl@google.com&gt;<br>
<b>Date: </b>Friday, August 7, 2020 at 7:57 PM<br>
<b>To: </b>Wenlei He &lt;wenlei@fb.com&gt;<br>
<b>Cc: </b>"llvm-dev@lists.llvm.org" &lt;llvm-dev@lists.llvm.org&gt;, Wei Mi &lt;wmi@google.com&gt;, Hongtao Yu &lt;hoy@fb.com&gt;<br>
<b>Subject: </b>Re: [RFC] Context-sensitive Sample PGO with Pseudo-Instrumentation<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New";color:black"><o:p> </o:p></span></p>
</div>
</div>
<p class="MsoNormal"><o:p> </o:p></p>
<div>
<div>
<p class="MsoNormal">On Fri, Aug 7, 2020 at 4:44 PM Wenlei He &lt;<a href="mailto:wenlei@fb.com">wenlei@fb.com</a>&gt; wrote:<o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-right:0in">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">Thanks for the thoughtful questions, David. See my
<span style="color:#4472C4">answers inline</span>.<o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">Thanks,<o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">Wenlei<o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in">
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><b><span style="font-size:12.0pt;color:black">From:
</span></b><span style="font-size:12.0pt;color:black">Xinliang David Li &lt;<a href="mailto:davidxl@google.com" target="_blank">davidxl@google.com</a>&gt;<br>
<b>Date: </b>Friday, August 7, 2020 at 1:24 PM<br>
<b>To: </b>Wenlei He &lt;<a href="mailto:wenlei@fb.com" target="_blank">wenlei@fb.com</a>&gt;<br>
<b>Cc: </b>"<a href="mailto:llvm-dev@lists.llvm.org" target="_blank">llvm-dev@lists.llvm.org</a>" &lt;<a href="mailto:llvm-dev@lists.llvm.org" target="_blank">llvm-dev@lists.llvm.org</a>&gt;, Wei Mi &lt;<a href="mailto:wmi@google.com" target="_blank">wmi@google.com</a>&gt;,
Hongtao Yu &lt;<a href="mailto:hoy@fb.com" target="_blank">hoy@fb.com</a>&gt;<br>
<b>Subject: </b>Re: [RFC] Context-sensitive Sample PGO with Pseudo-Instrumentation</span><o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">Wenlei, Thanks for the interesting proposal! Please see my replies inline below.</span><o:p></o:p></p>
</div>
</div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto">On Fri, Aug 7, 2020 at 11:28 AM Wenlei He &lt;<a href="mailto:wenlei@fb.com" target="_blank">wenlei@fb.com</a>&gt; wrote:<o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:12.0pt"><span style="color:black">Hi All,</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:'Arial',sans-serif;color:black">Our team at Facebook is building a new context-sensitive Sample PGO as an alternative to the existing AutoFDO. We’d like
to share our motivation, propose a new design, and reveal preliminary results on benchmarks. We will refer to the proposed design as CSSPGO in this RFC.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:'Arial',sans-serif;color:black">The new CSSPGO leverages simultaneous LBR and stack sampling to construct a full context-sensitive profile.</span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">Can you share more details on this? LBR only has 32 entries, so it won't give you full call context, so stack
unwinding is needed. What is the overhead you see in a production environment?</span><o:p></o:p></p>
</div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] We are not worried about overhead in production environment as the sampling rate there is extremely low. We did measure locally however, for
stack sampling and level 2 PEBS on top of regular LBR sampling, the overhead still isn’t very noticeable, but I don’t have numbers at hand.
</span><o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
</div>
</div>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:'Courier New';color:black">I assume this is with the no-omit-frame-pointer option, right?<o:p></o:p></span></p>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><span style="color:#4472C4">[wenlei] Right, and tail call is off too for our experiments, but we may keep it on for production usage later. See my reply to Wei’s question on this.<o:p></o:p></span></p>
</div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-right:0in">
<div>
<div>
<div>
<div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:'Arial',sans-serif;color:black">It doesn’t rely on previous inlining like today’s AutoFDO to get context-sensitive profile, and it also doesn’t need a separate
post-inline context-sensitive profile like CSPGO. </span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">What is the sample profile data size impact with the full context information?</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] Text CS profile is normally around 1x-10x of regular profile size, with all live context included. We plan to trim cold context, which we expect
to bring the size down in a meaningful way. Another source of size increase is the context string, which could contain duplicated mangled names (can be very long for C++ templated code), but should be very compressible with the built-in compression support
from extended binary profile. We will move to extended binary format, and leverage the compression support if needed. We can also consider more efficient fixed-length integer context representation (similar to rolling hash).
</span><o:p></o:p></p>
</div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
</div>
</div>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:'Courier New';color:black">What is the average and max number of live contexts you have seen? Statically it grows exponentially as the depth of the context increases.<o:p></o:p></span></p>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><span style="color:#4472C4">[wenlei] I guess you meant the ratio of number of live contexts to number of functions? I haven’t looked, but I’d expect profile size ratio to be a good proxy for that.<o:p></o:p></span></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-right:0in">
<div>
<div>
<div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:'Arial',sans-serif;color:black">In addition, we introduced pseudo-instrumentation for more accurate mapping from binary samples back to IR, similar to instrumentation
PGO, but without any measurable runtime overhead that is usually associated with instrumentation.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">Is CSSPGO inherently dependent upon pseudo-probe or is it orthogonal? I hope that it is the latter :)</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] They’re orthogonal. Context-sensitive SPGO can work without pseudo-probe and still use dwarf. Our plan is to keep context-sensitive SPGO working
w/ and w/o pseudo-probe functionality-wise, but we only look at perf and tune with the two combined.
</span><o:p></o:p></p>
</div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
</div>
</div>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:'Courier New';color:black">Great.<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New";color:black"><o:p> </o:p></span></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-right:0in">
<div>
<div>
<div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:'Arial',sans-serif;color:black">We have a functioning implementation for the new CSSPGO now. Initial results on SPEC2006 show ~2% geomean performance win
on top of AutoFDO (with MonoLTO and NewPM) and ~4% .text size reduction at the same time.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:4.0pt"><b><span style="font-size:17.0pt;font-family:'Arial',sans-serif;color:black">Motivation</span></b><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:'Arial',sans-serif;color:#1C1E21">AutoFDO is a big success as it lowers the entry barrier for PGO significantly while still delivering substantial performance
boost. However, there’s still a gap between AutoFDO and its instrumentation counterpart. From several failed internal attempts to improve AutoFDO, we realized that the bottleneck of AutoFDO lies in its profile quality. With the current level of profile quality,
it’s difficult to reap more performance win because good heuristics are often limited by inferior profile. That prompted a systemic effort to investigate and improve AutoFDO framework. Specifically, we’re trying to handle the two biggest sources of profile
quality issues:</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<ol style="margin-top:0in" start="1" type="1">
<li class="MsoNormal" style="color:black;margin-top:12.0pt;mso-margin-bottom-alt:auto;mso-list:l5 level1 lfo1;vertical-align:baseline">
<span style="font-family:'Arial',sans-serif">AutoFDO relies on a limited context-sensitive profile collected based on previous inlining. Therefore it can only replay or prune the previous inlining. With the main CGSCC inliner, post-inline counts are not accurate
due to scaling of context-less profile, which affects the effectiveness of later passes such as profile-guided code layout.</span><o:p></o:p></li></ol>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">I acknowledge the limitation here.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<ol style="margin-top:0in" start="1" type="1">
<li class="MsoNormal" style="color:black;margin-top:12.0pt;mso-margin-bottom-alt:auto;mso-list:l0 level1 lfo2;vertical-align:baseline">
<o:p></o:p></li><li class="MsoNormal" style="color:black;mso-margin-top-alt:auto;margin-bottom:12.0pt;mso-list:l0 level1 lfo2;vertical-align:baseline">
<span style="font-family:'Arial',sans-serif">Dwarf line and discriminator info aren’t always well-maintained throughout the compilation, thus using them as anchors to map binary samples back to the IR can sometimes be inaccurate, which leads to inferior profile
quality and limits PGO performance.</span><o:p></o:p></li></ol>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">I think we need more quantification of the impact of using debug information for matching purposes: How much
performance is left on the table due to this, and are they fixable issues or not?</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] The first table in the result section is comparing pseudo-probe with AutoFDO and Instr. PGO, all with inlining turned off. So that’s a quantitative
assessment of the effectiveness of pseudo-probe. It’s hard to assess performance benefit though, because PGO performance is a function of profile quality and heuristic. Currently heuristics are tuned to cope with the profile quality we have, so it may not
do justice for profile quality improvements that pseudo-probe brings us. </span>
<o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">One example is how FDO inliner evaluates call site. It checks callee’s total sample count instead of callee’s entry count. This is less than ideal, but
we couldn’t fix it due to profile quality issues – we can’t reliably get inlinee’s entry count with dwarf based approach, see discussion in
<a href="https://urldefense.proofpoint.com/v2/url?u=https-3A__reviews.llvm.org_D60086&d=DwMFaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=KfYo542rDdZQGClmgz-RBw&m=IiraiO5pLd86sJtoupX-V4fgITYAQHvv2GN-H_UmDXQ&s=TVgYwUBqNvzMAOEwn2FDgcKlvRrsbAvEXT4OscZS2n4&e=" target="_blank">
<span style="color:#4472C4">https://reviews.llvm.org/D60086</span></a>. That problem is solved with pseudo-probe, but until we change the inliner, we won’t see perf win from that particular profile quality improvement. There are other similar cases too, and
that’s why we used profile quality metric instead of performance to assess pseudo-probe.</span><o:p></o:p></p>
</div>
</div>
</div>
</div>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:'Courier New';color:black">Can you change the inliner to use entry count when a probe-based profile is used?<o:p></o:p></span></p>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><span style="color:#4472C4">[wenlei] Yes, we already made that change, and that’s one of the “</span><span style="font-family:'Arial',sans-serif;color:#4472C4">few other improvements for the FDO inliner</span><span style="color:#4472C4">”
I mentioned in the RFC. This is one example of the coupling between heuristic and profile quality.<o:p></o:p></span></p>
</div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-right:0in">
<div>
<div>
<div>
<div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">Some of the issues may be fixable with dwarf info maintenance, but the engineering cost to find and fix all issues is non-trivial. We think maintaining
anchor as IR is a more sustainable alternative than maintaining anchor as metadata.
</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<ol start="1" type="1">
<li class="MsoNormal" style="color:black;mso-margin-top-alt:auto;margin-bottom:12.0pt;mso-list:l3 level1 lfo3;vertical-align:baseline">
<o:p></o:p></li></ol>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:'Arial',sans-serif;color:black">To lift the above limitations, we’d like to propose an alternative design that consists of two components: 1) Context-sensitive
sample PGO, 2) Sample to IR mapping using pseudo probes. The goal is to further improve sample PGO performance while maintaining usability and keeping training runtime overhead at zero. In addition, we hope the CSSPGO framework can also open up opportunities
for new optimizations with more stringent requirements on profile quality. </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">CSSPGO is a very attractive optimization by itself. Can you provide more motivation for the pseudo probes?</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] One way to look at the combination of pseudo-probe and context-sensitive sample PGO is that, the former brings sample PGO closer to instrumentation
PGO, and the latter to sample PGO is like the two-stage CSPGO, or even post-link optimizer to instrumentation PGO. These are two orthogonal problems that need separate solutions.</span><o:p></o:p></p>
</div>
</div>
</div>
</div>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:'Courier New';color:black">There are also differences though:<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New";color:black"><o:p> </o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:'Courier New';color:black">1) CSPGO has lots of flow sensitivity and PLO has even more flow sensitivity while CSSPGO does not. CSSPGO has the advantage of guiding the inliner as well.<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New""><o:p> </o:p></span></p>
<p class="MsoNormal"><span style="color:#4472C4">[wenlei] Fair point. Though I’m wondering how much perf win does flow sensitivity bring to PGO? Curious if you have data for this. My guess is context sensitivity is much more visible than flow sensitivity for
PGO’s effectiveness. <o:p></o:p></span></p>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:'Courier New';color:black">2) Pseudo-probes are inserted pretty early in the pipeline, so it won't beat instrumentation PGO performance as the latter has early inlining to expose some CS. In other
words, Pseudo-probe depends on CSSPGO, but not the other way around.<o:p></o:p></span></p>
</div>
</div>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><span style="color:#4472C4">[wenlei] We intentionally insert pseudo-probe early for better resilience to compiler version changes, knowing that context-sensitivity will be covered by CSSPGO. We could also insert pseudo-probe later like
Instr PGO to cover some context-sensitivity. We choose to do pseudo instrumentation early because we view the combination as a package even though they can be decoupled for clean design. That said, I agree that it’s easier for CSSPGO to work without pseudo-probe
than for pseudo-probe to work without CSSPGO.<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-right:0in">
<div>
<div>
<div>
<div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">There’re other secondary motivations for pseudo-probe as well beyond its profile quality benefits that I didn’t mention earlier:</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">1). Stale profile detection. With line numbers, it’s hard to detect and react to stale profile. Pseudo-probes are tied to blocks so it’s effectively
using CFG as carrier for profile, which makes stale profile detection easier. </span>
<o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">2). Resilience to source changes. We’ve seen cases where deleting a single line of comment caused an 8% perf regression for a large service because it
completely messed up profile annotation for a critical path. That will not happen with pseudo-probe – any source change not altering CFG will be tolerated without perf impact.</span><o:p></o:p></p>
</div>
</div>
</div>
</div>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:'Courier New';color:black">While this is true, the problem with the CFG-based approach is that a local CFG change can cause the whole profile to be lost, which can be bad too. Debug info based approach
allows partial matching while relying on a propagation algorithm to compensate the rest.<o:p></o:p></span></p>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<p class="MsoNormal"><span style="color:#4472C4">[wenlei] If we want to tolerate local CFG changes, and still match the majority of the CFG, we could employ fuzzy CFG matching, and still use propagation to infer the unmatched parts. I think that should be easy to
do, and more effective than line based fuzzy/partial match still. That’s something we planned to implement too.</span><o:p></o:p></p>
</div>
<div>
<p class="MsoNormal"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-right:0in">
<div>
<div>
<div>
<div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">3). Possibility of offline count inference. We have an experiment that encodes edges alongside with probes (blocks), so more sophisticated offline count
inference algorithm is possible to further improve profile quality. Our algorithm researchers are working on new profile inference solution now.</span><o:p></o:p></p>
</div>
</div>
</div>
</div>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:'Courier New';color:black">This is needed because critical edges cannot be split as in instrumentation-based PGO?<o:p></o:p></span></p>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><span style="color:#4472C4">[wenlei] Yes, this is one of the cases we want to cover. We also have the option to insert nop for critical edges, but we want to avoid that, as it may lead to visible run time overhead.<o:p></o:p></span></p>
</div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-right:0in">
<div>
<div>
<div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:4.0pt"><b><span style="font-size:17.0pt;font-family:"Arial",sans-serif;color:black">Context-sensitive Sample PGO</span></b><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">The effectiveness of BOLT, Propeller and CSPGO all demonstrated the importance of context-sensitive profile for PGO. However
there are two limitations with the existing approaches.</span><o:p></o:p></p>
<ol style="margin-top:0in" start="1" type="1">
<li class="MsoNormal" style="color:black;margin-top:12.0pt;mso-margin-bottom-alt:auto;mso-list:l2 level1 lfo4;vertical-align:baseline">
<span style="font-family:"Arial",sans-serif">The current solutions focus on leveraging a context-sensitive profile to attain an accurate post-inline profile that helps achieve a better code layout, but do not use the context-sensitive profile to drive better
inlining.</span><o:p></o:p></li><li class="MsoNormal" style="color:black;mso-margin-top-alt:auto;margin-bottom:12.0pt;mso-list:l2 level1 lfo4;vertical-align:baseline">
<span style="font-family:"Arial",sans-serif">The current solutions involve multiple training processes and profiles (e.g. a post-inline profile for CSPGO, or a post-link profile for BOLT and Propeller), which incurs higher operational cost and complicates the
build and release workflow.</span><o:p></o:p></li></ol>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">We propose a full context-sensitive sample profiling infrastructure that utilizes both LBR and call stack samples at the
same time to synthesize a profile with a full context sensitivity. The key advantage is that rather than relying on previous inlining or a separate profile, the profile collected with the new approach will have full calling contexts recovered from both inlined
and not inlined call sites. To achieve an accurate post-inline profile, a separate profile is no longer needed. Instead, the post-inline profile can be directly derived from adjusting the input profile based on all inline decisions. The richer context-sensitive
profile also enables better inline decisions. The infrastructure has two key components listed below.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:4.0pt"><b><span style="font-size:13.0pt;font-family:"Arial",sans-serif;color:black">Synthesizing context-sensitive LBR with a virtual unwinder</span></b><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">To make sample PGO’s input profile context aware, we need to know the call stack of each LBR fall through path. That is
done by sampling LBR and call stack simultaneously. With that, each sample will contain a call stack in addition to LBR entries. We use level 2 PEBS to control sampling skid so that the leaf frame from stack sample aligns with leaf frame from LBR. The raw
call stack sample describes the calling context for the leaf LBR entry. In addition, by unwinding “call” and “return” (including implicit ones from inlinee) from LBR entries backwards on top of raw stack samples, we can recover the calling context for each
of the LBR entries from the sample, thus synthesizing context-sensitive LBR profile.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">We can then generate context-sensitive sample PGO profile using the context-sensitive LBR profile. In the new profile, a
function’s profile becomes a collection of profiles, each representing a profile for a given calling context.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:"Courier New";color:black">Sounds good -- see the overhead question posted at the beginning.</span><o:p></o:p></p>
</div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:4.0pt"><b><span style="font-size:13.0pt;font-family:"Arial",sans-serif;color:black">Context-sensitive FDO/PGO framework in LLVM</span></b><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">In order to leverage context-sensitive profile for inlining, and to maintain accurate post-inline counts, we introduced </span><span style="font-family:"Courier New";color:black">SampleContextTracker</span><span style="font-family:"Arial",sans-serif;color:black"> which
is a layer sitting in between input profile and the profile used to annotate CFG for optimizations. We also introduced the notion of base profile which </span><span style="font-family:"Arial",sans-serif;color:#1C1E21">is the merged profile for function’s profiles
from any outstanding (not inlined) context, </span><span style="font-family:'Arial',sans-serif;color:black">and context profile which is a function's profile for a given calling context. The framework includes four simple APIs for updating and querying profiles:</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">Query API:</span><o:p></o:p></p>
<ul style="margin-top:0in" type="disc">
<li class="MsoNormal" style="color:black;margin-top:12.0pt;mso-margin-bottom-alt:auto;mso-list:l4 level1 lfo5;vertical-align:baseline">
<span style="font-family:"Courier New"">getBaseSamplesFor</span><span style="font-family:"Arial",sans-serif">: Query base profile by function name.</span><o:p></o:p></li><li class="MsoNormal" style="color:black;mso-margin-top-alt:auto;margin-bottom:12.0pt;mso-list:l4 level1 lfo5;vertical-align:baseline">
<span style="font-family:"Courier New"">getContextSamplesFor</span><span style="font-family:"Arial",sans-serif">: Query context profile by calling context and function name.</span><o:p></o:p></li></ul>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">Update API:</span><o:p></o:p></p>
<ul style="margin-top:0in" type="disc">
<li class="MsoNormal" style="color:black;margin-top:12.0pt;mso-margin-bottom-alt:auto;mso-list:l1 level1 lfo6;vertical-align:baseline">
<span style="font-family:"Courier New"">MarkContextSamplesInlined</span><span style="font-family:"Arial",sans-serif">: When a function is inlined for a given calling context, we need to mark the context profile for that context as inlined. This is to make sure
we don't include inlined context profile when synthesizing the base profile.</span><o:p></o:p></li><li class="MsoNormal" style="color:black;mso-margin-top-alt:auto;margin-bottom:12.0pt;mso-list:l1 level1 lfo6;vertical-align:baseline">
<span style="font-family:"Courier New"">PromoteMergeContextSamplesTree</span><span style="font-family:"Arial",sans-serif">: When a function is not inlined for a given calling context, we need to promote the context profile tree to be top-level context. This
preserves the child context under that function so later inline decisions for calls originating from that not inlined function will still be driven by an accurate context profile.</span><o:p></o:p></li></ul>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">These APIs are used by </span><span style="font-family:"Courier New";color:black">SampleProfileLoader</span><span style="font-family:"Arial",sans-serif;color:black">’s
inlining, for better inline decisions and better post-inline counts. For optimal results, the new infrastructure needs to work with a top-down FDO inliner. We added top-down FDO inlining under a switch, and the switch is turned on by default in upstream recently.
There’re a few other improvements for the FDO inliner that we plan to upstream soon.</span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">The profile data should be usable by the SCC inliner as well. In the bottom up inlining, as the function gets
inlined further up in the call chain, the inline instance has fewer incoming contexts to merge.</span><o:p></o:p></p>
</div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] Yes, we intentionally introduced the SampleContextTracker abstraction that is decoupled from SampleProfileLoader, so it can work with both FDO
inliner and SCC inliner. But we expect FDO inliner to take over more inlining for CSSPGO because the FDO inliner is no longer a replay inliner now. And it’s good as top-down inline helps with specialization which is important for context-sensitive inlining.</span><o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:4.0pt"><b><span style="font-size:17.0pt;font-family:"Arial",sans-serif;color:black">Pseudo-instrumentation for sample to IR mapping</span></b><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">Being able to profile production binaries is a key advantage of AutoFDO over Instrumentation PGO, but it also comes with
a big challenge. While using line number and discriminator as anchor for profile mapping incurs zero run time overhead for AutoFDO, it’s not as accurate as instrumented probes. This is because the instrumented probes are part of the IR, rather than metadata
attached to the IR like </span><span style="font-family:"Courier New";color:black">!dbg</span><span style="font-family:"Arial",sans-serif;color:black">. That has two implications: 1) it’s easier to maintain IR than metadata for optimization passes; 2) probe
blocks some CFG transformations that can mess up profile correlation.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:'Arial',sans-serif;color:black">With the proposed pseudo instrumentation, we can achieve most of the benefit of instrumentation PGO with little runtime overhead.
We instrument each basic block with a pseudo probe associated with the block Id. Unlike in PGO instrumentation where a counter is implemented as a persisting operation such as atomic read/write or runtime helper call, a pseudo probe is implemented as a dedicated
intrinsic call with </span><span style="font-family:"Courier New";color:black">IntrInaccessibleMemOnly</span><span style="font-family:"Arial",sans-serif;color:black"> flag. The intrinsic comes with </span><span style="font-family:"Arial",sans-serif;color:#1C1E21">most
of the semantics of a PGO </span><span style="font-family:"Arial",sans-serif;color:black">counter</span><span style="font-family:"Arial",sans-serif;color:#1C1E21"> but is much less optimization-</span><span style="font-family:"Arial",sans-serif;color:black">intrusive. </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">The pseudo probe intrinsic calls are on the IR throughout the optimization and code generation pipeline and are materialized
as a piece of binary data stored in a separate </span><span style="font-family:"Courier New";color:black">.pseudo_probe</span><span style="font-family:"Arial",sans-serif;color:black"> data section.</span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">How is this information maintained? Blocks can be eliminated or cloned in many optimization passes: jump threading,
taildup, unrolling, peeling etc. For instance, how to handle the block that is merged into another? Does it lose samples because of this?</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] They are just maintained as part of IR, like any other instructions, without special care. The key difference is they’re part of IR instead
of metadata attached to IR. We can categorize relevant CFG transformations into 1) duplication, 2) merge and removal.
</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">For any duplication, tail/head dup, unrolling, probe will be duplicated along with other instructions, and we don’t need duplication factor that was
used by dwarf-based approach, because counts from duplicated probes will be added together naturally. For merge and removal,
</span><span style="font-family:"Courier New";color:#4472C4">IntrInaccessibleMemOnly</span><span style="font-family:"Arial",sans-serif;color:#4472C4"> flag will block it, similar to real probes.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:#4472C4"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:#4472C4">Pseudo-probe is a framework that is tunable. Depending on the semantic we put on the intrinsic, it can be as heavy as
real probe, or as light as a label. </span><span style="font-family:"Courier New";color:#4472C4">IntrInaccessibleMemOnly</span><span style="font-family:"Arial",sans-serif;color:#4472C4"> is a carefully chosen semantic based on our experiments that balances
run time overhead and profile quality – it doesn’t incur measurable overhead even though it still blocks merging and removal; we didn’t see measurable overhead from SPEC or a large internal workload. But the profile quality improvement is measurable
as the 1st table in the result section shows.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">The section is then used to map binary samples back to blocks of CFG during profile generation. There are also no real machine
instructions generated for a pseudo probe and the </span><span style="font-family:'Courier New';color:black">.pseudo_probe</span><span style="font-family:'Arial',sans-serif;color:black"> section won’t be loaded into memory at runtime, therefore they should
incur very little runtime overhead. In fact, we see no measurable performance impact from pseudo-instrumentation itself on SPEC2006 or a big internal workload.</span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:"Courier New";color:black">How large are the probe sections? </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] About 10% of binary size, another 2% if we encode CFG edges in addition to probes/blocks.</span><o:p></o:p></p>
</div>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:4.0pt"><b><span style="font-size:13.0pt;font-family:"Arial",sans-serif;color:black">Pseudo-instrumentation integration and Pass Ordering</span></b><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:'Arial',sans-serif;color:black">One implication from pseudo-probe instrumentation is that the profile is now sensitive to CFG changes. We now detect stale
profiles for sample PGO via CFG checksum, instead of just using it. However, the potential downside is that CFG may change between different versions of the compiler even if the source code is unchanged. To solve that problem, we perform the pseudo instrumentation
very early in the pre-LTO pipeline, before any CFG transformation. This ensures that the CFG instrumented and annotated is stable. We added </span><span style="font-family:"Courier New";color:black">SampleProfileProber</span><span style="font-family:"Arial",sans-serif;color:black"> that
performs the pseudo instrumentation and runs independent of profile annotation.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">A new switch </span><span style="font-family:"Courier New";color:black">-fpseudo-probe-for-profiling</span><span style="font-family:"Arial",sans-serif;color:black"> is
added to enable sample PGO with pseudo instrumentation, similar to </span><span style="font-family:"Courier New";color:black">-fdebug-info-for-profiling</span><span style="font-family:"Arial",sans-serif;color:black"> for AutoFDO. Input profile is still provided
through the same switch used by today’s AutoFDO, namely </span><span style="font-family:"Courier New";color:black">-fprofile-sample-use</span><span style="font-family:"Arial",sans-serif;color:black">, and the profile loader will automatically decide how to
load and annotate profile depending on whether input profile is dwarf-based or pseudo-probe based.</span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:"Courier New";color:black">Can you compare the source change tolerance of pseudo probe based approach vs debug info based approach?</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] Pseudo-probe should be more resilient to source changes. See my reply for motivation of pseudo-probe. Pseudo-probe will be able to tolerate
source changes as long as they don’t alter CFG. On the contrary, changes that delete a comment and shift line offset can cause perf churn with line-based approach. We've been bitten by this a few times – people making comment only change during holiday freeze
only to find surprising perf regression due to AutoFDO </span><span style="font-family:"Apple Color Emoji";color:#4472C4">😊</span><span style="color:#4472C4">. It also opens up possibility of fuzzy CFG matching when there’s a CFG mutation due to source change
to make it even more resilient. </span><o:p></o:p></p>
</div>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
</div>
</div>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New";color:black">Ok. Also see my reply above. It seems to me that the line shifting problem should be solvable for AFDO (or make it more tolerant).
<o:p></o:p></span></p>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><span style="color:#4472C4">[wenlei] Agreed that we can do better with line number approach too. But CFG as profile carrier has richer info than line, and is closer to profile which is inherently CFG based. So I think it should be easier
with probe and CFG. <o:p></o:p></span></p>
</div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-right:0in">
<div>
<div>
<div>
<div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:4.0pt"><b><span style="font-size:17.0pt;font-family:"Arial",sans-serif;color:black">New profile format and profile generation</span></b><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">We extend current profile format in order to be able to represent a full context-sensitive profile and also encode pseudo-probe
info. This is done without drastically diverging from today’s AutoFDO profile format so that existing tools and libraries can be reused with minor changes (e.g. </span><span style="font-family:"Courier New";color:black">llvm-profdata</span><span style="font-family:"Arial",sans-serif;color:black">,
profiler reader and writer).</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">For a context-sensitive profile, we extend the profile format by changing the function profile header line to include calling
context in addition to function name. With today’s AutoFDO, we have a single profile header for each function to represent its accumulative profile. E.g. This is the profile header for </span><span style="font-family:"Courier New";color:black">foo</span><span style="font-family:"Arial",sans-serif;color:black">,
with 1290 total samples, and 74 header samples.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Courier New";color:black">foo:1290:74</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">For CSSPGO, we will have multiple profile headers for a single function’s profile, each representing profile for a specific
calling context as shown below. CSSPGO profile header is bracketed to differentiate from today’s AutoFDO.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Courier New";color:black">[main:12 @ bar:3 @ foo]:279:54</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Courier New";color:black">[main:19 @ zoo:7 @ foo]:1011:20</span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:"Courier New";color:black">sounds good.</span><o:p></o:p></p>
</div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">With calling context encoded in the function header, we no longer need a nested function profile for inlinees. Instead,
a context profile will be represented uniformly using context strings in the function profile header, regardless of whether the calls in the context are inlined or not. The flat structure makes sure that context profile is easily indexable. The change is mostly
transparent to tools like </span><span style="font-family:"Courier New";color:black">llvm-profdata</span><span style="font-family:"Arial",sans-serif;color:black">. Support for binary profile format has not been added yet, but should be easy to do.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">It is still useful to annotate (at least with a comment line) whether a profile is for a top-level function or an inline
instance.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] Agreed, and that’s in our plan too - we need that for tuning purpose.
</span><o:p></o:p></p>
</div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">For pseudo-probe, we repurposed the line to count map of AutoFDO profile to be block Id to count map. This only changes
the interpretation of profile content rather than the representation, hence all reader/writer helpers can be reused.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">There's a new profile generation tool, </span><span style="font-family:"Courier New";color:black">llvm-profgen</span><span style="font-family:"Arial",sans-serif;color:black">,
with the virtual winder implemented for context-sensitive profiling, and uses the </span><span style="font-family:"Courier New";color:black">.pseudo_probe</span><span style="font-family:"Arial",sans-serif;color:black"> section to map binary profile to pre-opt
CFG profile. Since profile generation is a critical piece of the workflow, we’d like to propose to include the tool as part of LLVM, alongside </span><span style="font-family:'Courier New';color:black">llvm-profdata</span><span style="font-family:'Arial',sans-serif;color:black">.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:4.0pt"><b><span style="font-size:17.0pt;font-family:"Arial",sans-serif;color:black">Preliminary Results</span></b><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">To quantitatively assess profile quality improvement brought by pseudo-instrumentation, we introduce a profile quality metric.
We measure the metric by first annotating an optimized binary with the MIR block execution counts derived from a profile. The binary is then sampled and the counts are compared against the annotation. The weighted relative delta is used as an indicator for
profile quality (lower is better). </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">Table below shows the profile quality metric for SPEC2006. We can see from the numbers that the profile quality of pseudo-instrumentation
sample PGO is much better than AutoFDO and close to instrumentation PGO.</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:12.0pt"><span style="color:black"> </span><o:p></o:p></p>
<table class="MsoNormalTable" border="0" cellspacing="0" cellpadding="0" width="624" style="width:6.5in;border-collapse:collapse">
<tbody>
<tr>
<td valign="top" style="border:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-family:"Arial",sans-serif;color:black">Profile quality metric</span><o:p></o:p></p>
</td>
<td valign="top" style="border:solid black 1.0pt;border-left:none;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-family:"Arial",sans-serif;color:black">Baseline AutoFDO</span><o:p></o:p></p>
</td>
<td valign="top" style="border:solid black 1.0pt;border-left:none;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-family:"Arial",sans-serif;color:black">Instrumentation PGO</span><o:p></o:p></p>
</td>
<td valign="top" style="border:solid black 1.0pt;border-left:none;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-size:9.0pt;font-family:"Arial",sans-serif;color:black">Sample PGO w/ Pseudo Instrumentation</span><o:p></o:p></p>
</td>
</tr>
<tr>
<td valign="top" style="border:solid black 1.0pt;border-top:none;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">SPEC2006</span><o:p></o:p></p>
</td>
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="right" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:right">
<span style="font-family:"Arial",sans-serif;color:black">24.58%</span><o:p></o:p></p>
</td>
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="right" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:right">
<span style="font-family:"Arial",sans-serif;color:black">15.70%</span><o:p></o:p></p>
</td>
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="right" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:right">
<span style="font-family:"Arial",sans-serif;color:black">16.21%</span><o:p></o:p></p>
</td>
</tr>
</tbody>
</table>
<p class="MsoNormal" style="mso-margin-top-alt:auto;margin-bottom:12.0pt"><span style="color:black"> </span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:"Courier New";color:black">Instrumentation PGO does not have context sensitivity, so I would expect it scores worse than CSSPGO. Do you
know why it is better here?</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] This is for evaluating effectiveness of pseudo-probe exclusively. We have all inlining turned off for this experiment, and this is without context-sensitive
profile for Sample PGO either, so the comparison should be fair, and Instrumentation PGO should be the upper bound.</span><o:p></o:p></p>
</div>
</div>
</div>
</div>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"><o:p> </o:p></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New";color:black">It would be nice to see the main source of precision loss of AFDO here. Probably related to the missing edge information Wei mentioned.<o:p></o:p></span></p>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><span style="color:#4472C4">[wenlei] The edge count issue Wei mentioned isn’t handled by pseudo-probe either, at least not for now. From our investigation, the problem here is more like death by a thousand cuts.
<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New";color:black"><o:p> </o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New";color:black"><o:p> </o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New";color:black">thanks,<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New";color:black"><o:p> </o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="font-size:12.0pt;font-family:"Courier New";color:black">David<o:p></o:p></span></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-right:0in">
<div>
<div>
<div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">We also measured performance and code size on SPEC2006 with CSSPGO. The measurement was done with MonoLTO and new pass manager,
with tuning for FDO inliner to accommodate context-sensitive profile, and used training dataset for both pass1 and pass2. The result shows ~2% performance win on top of today’s AutoFDO, with ~4% .text reduction, see table below. </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<table class="MsoNormalTable" border="0" cellspacing="0" cellpadding="0" width="624" style="width:6.5in;border-collapse:collapse">
<tbody>
<tr style="height:21.0pt">
<td rowspan="2" valign="top" style="border:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt;height:21.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-family:"Arial",sans-serif;color:black">SPEC2006</span><o:p></o:p></p>
</td>
<td colspan="3" valign="top" style="border:solid black 1.0pt;border-left:none;padding:5.0pt 5.0pt 5.0pt 5.0pt;height:21.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-family:"Arial",sans-serif;color:black">Performance</span><o:p></o:p></p>
</td>
<td colspan="3" valign="top" style="border:solid black 1.0pt;border-left:none;padding:5.0pt 5.0pt 5.0pt 5.0pt;height:21.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-family:"Arial",sans-serif;color:black">Code Size</span><o:p></o:p></p>
</td>
</tr>
<tr style="height:21.0pt">
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt;height:21.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-size:10.0pt;font-family:"Arial",sans-serif;color:black">AutoFDO over LTO</span><o:p></o:p></p>
</td>
<td width="79" valign="top" style="width:59.0pt;border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt;height:21.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-size:10.0pt;font-family:"Arial",sans-serif;color:black">CSSPGO</span><o:p></o:p></p>
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-size:10.0pt;font-family:"Arial",sans-serif;color:black">Over LTO</span><o:p></o:p></p>
</td>
<td width="89" valign="top" style="width:66.55pt;border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt;height:21.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-size:9.0pt;font-family:"Arial",sans-serif;color:black">CSSPGO over AutoFDO</span><o:p></o:p></p>
</td>
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt;height:21.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-size:10.0pt;font-family:"Arial",sans-serif;color:black">AutoFDO over LTO</span><o:p></o:p></p>
</td>
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt;height:21.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-size:10.0pt;font-family:"Arial",sans-serif;color:black">CSSPGO</span><o:p></o:p></p>
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-size:10.0pt;font-family:"Arial",sans-serif;color:black">Over LTO</span><o:p></o:p></p>
</td>
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt;height:21.0pt">
<p class="MsoNormal" align="center" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:center">
<span style="font-size:9.0pt;font-family:"Arial",sans-serif;color:black">CSSPGO over AutoFDO</span><o:p></o:p></p>
</td>
</tr>
<tr>
<td valign="top" style="border:solid black 1.0pt;border-top:none;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">Geomean Delta %</span><o:p></o:p></p>
</td>
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="right" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:right">
<span style="font-family:"Arial",sans-serif;color:#6AA84F">6.80%</span><o:p></o:p></p>
</td>
<td width="79" valign="top" style="width:59.0pt;border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="right" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:right">
<span style="font-family:"Arial",sans-serif;color:#6AA84F">8.70%</span><o:p></o:p></p>
</td>
<td width="89" valign="top" style="width:66.55pt;border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="right" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:right">
<span style="font-family:"Arial",sans-serif;color:#6AA84F">2.04%</span><o:p></o:p></p>
</td>
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="right" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:right">
<span style="font-family:"Arial",sans-serif;color:#CC0000">11.17%</span><o:p></o:p></p>
</td>
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="right" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:right">
<span style="font-family:"Arial",sans-serif;color:#CC0000">6.66%</span><o:p></o:p></p>
</td>
<td valign="top" style="border-top:none;border-left:none;border-bottom:solid black 1.0pt;border-right:solid black 1.0pt;padding:5.0pt 5.0pt 5.0pt 5.0pt">
<p class="MsoNormal" align="right" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;text-align:right">
<span style="font-family:"Arial",sans-serif;color:#CC0000">4.06%</span><o:p></o:p></p>
</td>
</tr>
</tbody>
</table>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">While the SPEC2006 benchmark suite is different from large workloads, we think the results demonstrated the potential of
CSSPGO and served its purpose for proof of concept. We plan to continue tuning and start to evaluate larger internal workloads soon, and we’d like to upstream our work. Feedback is welcome!</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black"> </span><o:p></o:p></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:'Courier New';color:black">What is the performance win with pseudo-probe alone?</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:#4472C4">[wenlei] We don’t have numbers for pseudo-probe alone. As I mentioned earlier, profile quality improvement may not translate directly to perf win without
heuristic changes. That’s why we evaluate pseudo-probe exclusively with profile quality metric. The hope is that it will open up opportunity for better optimizations. E.g. it could potentially help the Machine Function Splitting pass too. That said, pseudo-probe
does bring extra win for CSSPGO compared to line-based CSSPGO for some benchmarks, but we didn’t tune CSSPGO with line-based profile, so we didn’t aggregate numbers as the comparison isn’t fair either.</span><o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:"Courier New""> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:"Courier New";color:black">thanks,</span><o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:"Courier New";color:black"> </span><o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-size:12.0pt;font-family:"Courier New";color:black">David</span><o:p></o:p></p>
</div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"> <o:p></o:p></p>
</div>
<blockquote style="border:none;border-left:solid #CCCCCC 1.0pt;padding:0in 0in 0in 6.0pt;margin-left:4.8pt;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt">
<div>
<div>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:"Arial",sans-serif;color:black">Thanks,</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="font-family:'Arial',sans-serif;color:black">Wenlei &amp; Hongtao</span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span style="color:black"> </span><o:p></o:p></p>
</div>
</div>
</blockquote>
</div>
</div>
</div>
</div>
</blockquote>
</div>
</div>
</div>
</body>
</html>