<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body text="#000000" bgcolor="#FFFFFF">
    <br>
    <div class="moz-cite-prefix">On 07/23/2018 08:25 PM, Saito, Hideki
      wrote:<br>
    </div>
    <blockquote type="cite"
cite="mid:899F03F2C73A55449C51631866B88749619D4C7C@FMSMSX109.amr.corp.intel.com">
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
      <meta name="Generator" content="Microsoft Word 15 (filtered
        medium)">
      <style><!--
/* Font Definitions */
@font-face
        {font-family:"MS Mincho";
        panose-1:2 2 6 9 4 2 5 8 3 4;}
@font-face
        {font-family:"Cambria Math";
        panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
        {font-family:Calibri;
        panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
        {font-family:Consolas;
        panose-1:2 11 6 9 2 2 4 3 2 4;}
@font-face
        {font-family:"\@MS Mincho";
        panose-1:2 2 6 9 4 2 5 8 3 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
        {margin:0in;
        margin-bottom:.0001pt;
        font-size:12.0pt;
        font-family:"Times New Roman",serif;}
a:link, span.MsoHyperlink
        {mso-style-priority:99;
        color:blue;
        text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
        {mso-style-priority:99;
        color:purple;
        text-decoration:underline;}
pre
        {mso-style-priority:99;
        mso-style-link:"HTML Preformatted Char";
        margin:0in;
        margin-bottom:.0001pt;
        font-size:10.0pt;
        font-family:"Courier New";}
p.MsoListParagraph, li.MsoListParagraph, div.MsoListParagraph
        {mso-style-priority:34;
        margin-top:0in;
        margin-right:0in;
        margin-bottom:0in;
        margin-left:.5in;
        margin-bottom:.0001pt;
        font-size:12.0pt;
        font-family:"Times New Roman",serif;}
span.HTMLPreformattedChar
        {mso-style-name:"HTML Preformatted Char";
        mso-style-priority:99;
        mso-style-link:"HTML Preformatted";
        font-family:Consolas;}
span.EmailStyle19
        {mso-style-type:personal-reply;
        font-family:"Calibri",sans-serif;
        color:#1F497D;}
.MsoChpDefault
        {mso-style-type:export-only;
        font-family:"Calibri",sans-serif;}
@page WordSection1
        {size:8.5in 11.0in;
        margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
        {page:WordSection1;}
/* List Definitions */
@list l0
        {mso-list-id:1688824935;
        mso-list-type:hybrid;
        mso-list-template-ids:51669138 67698705 67698713 67698715 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l0:level1
        {mso-level-text:"%1\)";
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level2
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level3
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l0:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level5
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l0:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level8
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
ol
        {margin-bottom:0in;}
ul
        {margin-bottom:0in;}
--></style>
      <div class="WordSection1">
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><o:p> </o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">My
perspective, being a vectorizer guy, is that the vectorizer
            should<o:p></o:p></span></p>
        <p class="MsoListParagraph"
          style="text-indent:-.25in;mso-list:l0 level1 lfo1"><!--[if !supportLists]--><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><span
              style="mso-list:Ignore">1)<span style="font:7.0pt
                "Times New Roman"">     
              </span></span></span><!--[endif]--><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Take
            this optimization into account in the cost modeling so that
            it favors full vector compute.<o:p></o:p></span></p>
        <p class="MsoListParagraph"
          style="text-indent:-.25in;mso-list:l0 level1 lfo1"><!--[if !supportLists]--><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><span
              style="mso-list:Ignore">2)<span style="font:7.0pt
                "Times New Roman"">     
              </span></span></span><!--[endif]--><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">but
            generate plain widened computation:<br>
            full vector unit stride load of A[],<br>
            full vector unit stride load of B[],<br>
            sign extend both, (this makes it 2x full vector, on the
            surface)<br>
            multiply<br>
            add<br>
            …<br>
            standard reduction last value sequence after the loop<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">and
let a downstream optimizer, possibly in the Target, use
            instructions like (v)pmaddwd effectively.<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">If
            needed, an IR-to-IR xform before hitting Target.<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><o:p> </o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">This
            mechanism also works if a programmer or other FE produces a
similar naïvely vectorized IR like the above.</span></p>
      </div>
    </blockquote>
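For reference, the loop shape under discussion (per the godbolt example in the quoted thread) is a plain dot-product reduction accumulating sign-extended 16-bit products into a 32-bit sum. A minimal C sketch (the function name is illustrative, not from the original source):

```c
#include <stdint.h>

/* Dot-product reduction: sign-extend 16-bit inputs, multiply to
   32 bits, and accumulate. This is the loop shape the vectorizer
   widens, and the pattern the x86 backend wants to map to vpmaddwd. */
int32_t dot(const int16_t *a, const int16_t *b, int n) {
    int32_t sum = 0;
    for (int i = 0; i < n; ++i)
        sum += (int32_t)a[i] * (int32_t)b[i];
    return sum;
}
```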
    <br>
    I think that this should, in general, be our strategy. We might
    have different reduction strategies (we already do, at least in
    terms of the final reduction tree), and one of them might include
    what x86 wants here, so long as we can reasonably create a
    cost-modeling interface that lets us differentiate it from other
    strategies at the IR level. Lacking the ability to abstract this
    behind a generalized strategy with an IR-level cost-modeling
    interface, I think that the vectorizer should produce
    straightforward IR (e.g., what we currently produce with VF=16;
    see the other discussion of the vectorizer-maximize-bandwidth
    option) and the target can then adjust it as necessary to take
    advantage of special isel opportunities.<br>
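    For concreteness, here is a scalar model of the pairwise
    multiply-and-horizontal-add that vpmaddwd performs per 32-bit
    lane, which is also what the even/odd decomposition proposed in
    the quoted discussion computes (a sketch; the helper name is made
    up for illustration):

```c
#include <stdint.h>

/* Scalar model of a 128-bit vpmaddwd: each of the four 32-bit output
   lanes is the sum of the products of one adjacent pair of
   sign-extended 16-bit elements from each input. */
void pmaddwd_model(const int16_t a[8], const int16_t b[8],
                   int32_t out[4]) {
    for (int i = 0; i < 4; ++i)
        out[i] = (int32_t)a[2 * i] * b[2 * i]
               + (int32_t)a[2 * i + 1] * b[2 * i + 1];
}
```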
    <br>
    Thanks again,<br>
    Hal<br>
    <br>
    <blockquote type="cite"
cite="mid:899F03F2C73A55449C51631866B88749619D4C7C@FMSMSX109.amr.corp.intel.com">
      <div class="WordSection1">
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Since
the vectorizer should understand the existence of the
            optimization, it can certainly be arm-twisted to<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">generate
            the IR desired by the Target. However, whether we want to do
            that is a totally different story.<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><o:p> </o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:&quot;Calibri&quot;,sans-serif;color:#1F497D">The
            vectorizer should focus on having a reasonable cost model
            and generating straightforward, optimizable IR<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">----
            as opposed to generating convoluted IR (such as breaking up
a unit-stride load into even/odd parts, simply<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">to
            put them back to unit-stride again) wanted by the Target.<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><o:p> </o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">My
            recommendation is first analyzing the source of the current
            code generation deficiencies and then<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:&quot;Calibri&quot;,sans-serif;color:#1F497D">trying
            to remedy them there.<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><o:p> </o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Thanks,<o:p></o:p></span></p>
        <p class="MsoNormal"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Hideki<o:p></o:p></span></p>
        <p class="MsoNormal"><a name="_MailEndCompose"
            moz-do-not-send="true"><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><o:p> </o:p></span></a></p>
        <p class="MsoNormal"><a name="_____replyseparator"
            moz-do-not-send="true"></a><b><span
              style="font-size:11.0pt;font-family:"Calibri",sans-serif">From:</span></b><span
style="font-size:11.0pt;font-family:"Calibri",sans-serif">
            Craig Topper [<a class="moz-txt-link-freetext" href="mailto:craig.topper@gmail.com">mailto:craig.topper@gmail.com</a>]
            <br>
            <b>Sent:</b> Monday, July 23, 2018 4:37 PM<br>
            <b>To:</b> Hal Finkel <a class="moz-txt-link-rfc2396E" href="mailto:hfinkel@anl.gov"><hfinkel@anl.gov></a><br>
            <b>Cc:</b> Saito, Hideki <a class="moz-txt-link-rfc2396E" href="mailto:hideki.saito@intel.com"><hideki.saito@intel.com></a>;
            <a class="moz-txt-link-abbreviated" href="mailto:estotzer@ti.com">estotzer@ti.com</a>; Nemanja Ivanovic
            <a class="moz-txt-link-rfc2396E" href="mailto:nemanja.i.ibm@gmail.com"><nemanja.i.ibm@gmail.com></a>; Adam Nemet
            <a class="moz-txt-link-rfc2396E" href="mailto:anemet@apple.com"><anemet@apple.com></a>; <a class="moz-txt-link-abbreviated" href="mailto:graham.hunter@arm.com">graham.hunter@arm.com</a>; Michael
            Kuperstein <a class="moz-txt-link-rfc2396E" href="mailto:mkuper@google.com"><mkuper@google.com></a>; Sanjay Patel
            <a class="moz-txt-link-rfc2396E" href="mailto:spatel@rotateright.com"><spatel@rotateright.com></a>; Simon Pilgrim
            <a class="moz-txt-link-rfc2396E" href="mailto:llvm-dev@redking.me.uk"><llvm-dev@redking.me.uk></a>; <a class="moz-txt-link-abbreviated" href="mailto:ashutosh.nema@amd.com">ashutosh.nema@amd.com</a>;
            llvm-dev <a class="moz-txt-link-rfc2396E" href="mailto:llvm-dev@lists.llvm.org"><llvm-dev@lists.llvm.org></a><br>
            <b>Subject:</b> Re: [LoopVectorizer] Improving the
            performance of dot product reduction loop<o:p></o:p></span></p>
        <p class="MsoNormal"><o:p> </o:p></p>
        <div>
          <p class="MsoNormal"><br clear="all">
            <o:p></o:p></p>
          <div>
            <div>
              <p class="MsoNormal">~Craig<o:p></o:p></p>
            </div>
          </div>
          <p class="MsoNormal" style="margin-bottom:12.0pt"><o:p> </o:p></p>
          <div>
            <div>
              <p class="MsoNormal">On Mon, Jul 23, 2018 at 4:24 PM Hal
                Finkel <<a href="mailto:hfinkel@anl.gov"
                  target="_blank" moz-do-not-send="true">hfinkel@anl.gov</a>>
                wrote:<o:p></o:p></p>
            </div>
            <blockquote style="border:none;border-left:solid #CCCCCC
              1.0pt;padding:0in 0in 0in
              6.0pt;margin-left:4.8pt;margin-right:0in">
              <div>
                <p class="MsoNormal"><o:p> </o:p></p>
                <div>
                  <p class="MsoNormal">On 07/23/2018 05:22 PM, Craig
                    Topper wrote:<o:p></o:p></p>
                </div>
                <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
                  <div>
                    <div>
                      <p class="MsoNormal">Hello all,<o:p></o:p></p>
                    </div>
                    <div>
                      <p class="MsoNormal"><o:p> </o:p></p>
                    </div>
                    <div>
                      <p class="MsoNormal">This code <a
                          href="https://godbolt.org/g/tTyxpf"
                          target="_blank" moz-do-not-send="true">
                          https://godbolt.org/g/tTyxpf</a> is a dot
                        product reduction loop multiplying sign-extended
                        16-bit values to produce a 32-bit accumulated
                        result. The x86 backend is currently not able to
                        optimize it as well as gcc and icc. The IR we
                        are getting from the loop vectorizer has several
                        v8i32 adds and muls inside the loop. These are
                        fed by v8i16 loads and sexts from v8i16 to
                        v8i32. The x86 backend recognizes that these are
                        addition reductions of multiplication so we use
                        the vpmaddwd instruction which calculates 32-bit
                        products from 16-bit inputs and does a
                        horizontal add of adjacent pairs. A vpmaddwd
                        given two v8i16 inputs will produce a v4i32
                        result.<o:p></o:p></p>
                    </div>
                  </div>
                </blockquote>
              </div>
            </blockquote>
            <div>
              <p class="MsoNormal"><o:p> </o:p></p>
            </div>
            <div>
              <p class="MsoNormal">That godbolt link seems wrong. It
                wasn't supposed to be clang IR. This should be right.<o:p></o:p></p>
            </div>
            <div>
              <p class="MsoNormal"> <o:p></o:p></p>
            </div>
            <blockquote style="border:none;border-left:solid #CCCCCC
              1.0pt;padding:0in 0in 0in
              6.0pt;margin-left:4.8pt;margin-right:0in">
              <div>
                <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
                  <div>
                    <div>
                      <p class="MsoNormal"><o:p> </o:p></p>
                    </div>
                    <div>
                      <p class="MsoNormal">In the example code, because
                        we are reducing the number of elements from
                        8->4 in the vpmaddwd step we are left with a
                        width mismatch between vpmaddwd and the vpaddd
                        instruction that we use to sum with the results
                        from the previous loop iterations. We rely on
                        the fact that a 128-bit vpmaddwd zeros the upper
                        bits of the register so that we can use a
                        256-bit vpaddd instruction; the upper
                        elements can then keep going around the loop without
                        being disturbed in case they weren't initialized
                        to 0. But this still means the vpmaddwd
                        instruction is doing half the amount of work the
                        CPU is capable of if we had been able to use a
                        256-bit vpmaddwd instruction. Additionally,
                        future x86 CPUs will be gaining an instruction
                        that can do VPMADDWD and VPADDD in one
                        instruction, but that width mismatch makes that
                        instruction difficult to utilize.<o:p></o:p></p>
                    </div>
                    <div>
                      <p class="MsoNormal"><o:p> </o:p></p>
                    </div>
                    <div>
                      <p class="MsoNormal">In order for the backend to
                        handle this better it would be great if we could
                        have something like two v32i8 loads, two
                        shufflevectors to extract the even elements and
                        the odd elements to create four v16i8 pieces.<o:p></o:p></p>
                    </div>
                  </div>
                </blockquote>
                <p class="MsoNormal"><br>
                  Why v*i8 loads? I thought we had 16-bit and
                  32-bit types here?<o:p></o:p></p>
              </div>
            </blockquote>
            <div>
              <p class="MsoNormal"><o:p> </o:p></p>
            </div>
            <div>
              <p class="MsoNormal">Oops, that should have been v16i16.
                Mixed up my 256-bit types.<o:p></o:p></p>
            </div>
            <div>
              <p class="MsoNormal"> <o:p></o:p></p>
            </div>
            <blockquote style="border:none;border-left:solid #CCCCCC
              1.0pt;padding:0in 0in 0in
              6.0pt;margin-left:4.8pt;margin-right:0in">
              <div>
                <p class="MsoNormal"><br>
                  <br>
                  <o:p></o:p></p>
                <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
                  <div>
                    <div>
                      <p class="MsoNormal">Sign extend each of those
                        pieces. Multiply the two even pieces and the two
                        odd pieces separately, sum those results with a
                        v8i32 add. Then another v8i32 add to accumulate
                        the previous loop iterations. This ensures that
                        no pieces exceed the target vector width and the
                        final operation is correctly sized to go around
                        the loop in one register. All but the last add
                        can then be pattern matched to vpmaddwd as
                        proposed in <a
                          href="https://reviews.llvm.org/D49636"
                          target="_blank" moz-do-not-send="true">https://reviews.llvm.org/D49636</a>.
                        And for the future CPU the whole thing can be
                        matched to the new instruction.<o:p></o:p></p>
                    </div>
                    <div>
                      <p class="MsoNormal"><o:p> </o:p></p>
                    </div>
                    <div>
                      <p class="MsoNormal">Do other targets have a
                        similar instruction or a similar issue to this?
                        Is this something we can solve in the loop
                        vectorizer? Or should we have a separate IR
                        transformation that can recognize this pattern
                        and generate the new sequence? As a separate
                        pass we would need to pair two vector loads
                        together, remove a reduction step outside the
                        loop and remove half the phis assuming the loop
                        was partially unrolled. Or if there was only one
                        add/mul inside the loop we'd have to reduce its
                        width and the width of the phi.<o:p></o:p></p>
                    </div>
                  </div>
                </blockquote>
                <p class="MsoNormal"><br>
                  Can you explain how the desired code from the
                  vectorizer differs from the code that the vectorizer
                  produces if you add '#pragma clang loop
                  vectorize(enable) vectorize_width(16)' above the
                  loop? I tried it in your godbolt example and the
                  generated code looks very similar to the icc-generated
                  code.<o:p></o:p></p>
              </div>
            </blockquote>
            <div>
              <p class="MsoNormal"><o:p> </o:p></p>
            </div>
            <div>
              <p class="MsoNormal">It's similar, but the vpxor %xmm0,
                %xmm0, %xmm0 is being unnecessarily carried across the
                loop. It's then redundantly added twice in the reduction
                after the loop despite it being 0. This happens because
                we basically tricked the backend into generating a
                256-bit vpmaddwd concatenated with a 256-bit zero vector
                going into a 512-bit vpaddd before type legalization. The
                512-bit concat and vpaddd get split during type
                legalization, and the high half of the add gets constant
                folded away. I'm guessing we probably finished with 4
                vpxors before the loop, but MachineCSE (or some other
                pass?) combined two of them when it figured out the loop
                didn't modify them.<o:p></o:p></p>
            </div>
            <div>
              <p class="MsoNormal"> <o:p></o:p></p>
            </div>
            <blockquote style="border:none;border-left:solid #CCCCCC
              1.0pt;padding:0in 0in 0in
              6.0pt;margin-left:4.8pt;margin-right:0in">
              <div>
                <p class="MsoNormal"><br>
                  Thanks again,<br>
                  Hal<br>
                  <br>
                  <br>
                  <o:p></o:p></p>
                <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
                  <div>
                    <div>
                      <p class="MsoNormal"><o:p> </o:p></p>
                    </div>
                    <p class="MsoNormal">Thanks,<br clear="all">
                      <o:p></o:p></p>
                    <div>
                      <div>
                        <p class="MsoNormal">~Craig<o:p></o:p></p>
                      </div>
                    </div>
                  </div>
                </blockquote>
                <p class="MsoNormal"><br>
                  <br>
                  <o:p></o:p></p>
                <pre>-- <o:p></o:p></pre>
                <pre>Hal Finkel<o:p></o:p></pre>
                <pre>Lead, Compiler Technology and Programming Languages<o:p></o:p></pre>
                <pre>Leadership Computing Facility<o:p></o:p></pre>
                <pre>Argonne National Laboratory<o:p></o:p></pre>
              </div>
            </blockquote>
          </div>
        </div>
      </div>
    </blockquote>
    <br>
    <pre class="moz-signature" cols="72">-- 
Hal Finkel
Lead, Compiler Technology and Programming Languages
Leadership Computing Facility
Argonne National Laboratory</pre>
  </body>
</html>