<html>

  <head>

    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">

  </head>

  <body>

    The problem is that without this patch, other benchmarks show a

    perf regression of about 20-30%. I already know the root cause; the

    patch is in progress, and I hope to have it ready in 1-2 hours, just

    polishing some final things.<br>

    <p><br>

    </p>

    <pre class="moz-signature" cols="72">-------------

Best regards,

Alexey Bataev</pre>

    <div class="moz-cite-prefix">5/21/2021 8:11 AM, Alexander Kornienko

      пишет:<br>

    </div>

    <blockquote type="cite"

cite="mid:CAOweq9JYFEOfjSpnccqGYTEaHWCqygaHPPtN4=C2x1EpHjE-nw@mail.gmail.com">

      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">

      <div dir="ltr">Given that the change causes performance

        regressions in the range of 20-30% for some benchmarks, would

        you agree to rollback the patch while you're working on a fix?

        It would be helpful for us, if there was a cleaner version in

        mainline in the meantime.

        <div><br>

        </div>

        <div>Thanks!</div>

      </div>

      <br>

      <div class="gmail_quote">

        <div dir="ltr" class="gmail_attr">On Thu, May 20, 2021 at 10:11

          PM Alexey.Bataev &lt;<a href="mailto:a.bataev@outlook.com"

            moz-do-not-send="true">a.bataev@outlook.com</a>&gt; wrote:<br>

        </div>

        <blockquote class="gmail_quote" style="margin:0px 0px 0px

          0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">

          <div> Checked the regression, the fix is correct but I need to

            prepare another patch for better match detection in the

            vectorization tree. Hope to commit it in a day or two.<br>

            <pre cols="72">-------------

Best regards,

Alexey Bataev</pre>

            <div>5/20/2021 12:03 PM, Alexander Kornienko пишет:<br>

            </div>

            <blockquote type="cite">

              <div dir="ltr">We see performance regressions after this

                patch. A number of benchmarks regressed for more than

                10%. One example is the flops-6.c from the LLVM

                test-suite. An isolated test based on that benchmark:

                <div>

                  <div><br>

                  </div>

                  <div>$ cat flops-6.c</div>

                  <div>extern int printf (const char *__restrict

                    __format, ...);<br>

                    double T[36];<br>

                    double sa,sb,sc,sd,one,two;<br>

                    double four,piref;<br>

                    double scale;<br>

                    double A1 = -0.1666666666671334;<br>

                    double A2 = 0.833333333809067E-2;<br>

                    double A3 = 0.198412715551283E-3;<br>

                    double A4 = 0.27557589750762E-5;<br>

                    double A5 = 0.2507059876207E-7;<br>

                    double A6 = 0.164105986683E-9;<br>

                    double B1 = -0.4999999999982;<br>

                    double B2 = 0.4166666664651E-1;<br>

                    double B3 = -0.1388888805755E-2;<br>

                    double B4 = 0.24801428034E-4;<br>

                    double B5 = -0.2754213324E-6;<br>

                    double B6 = 0.20189405E-8;<br>

                    int main()<br>

                    {<br>

                       double s,u,v,w,x;<br>

                       long loops;<br>

                       register long i, m, n;<br>

                       printf("\n");<br>

                       printf("   FLOPS C Program (Double Precision),

                    V2.0 18 Dec 1992\n\n");<br>

                       loops = 15625;<br>

                       piref = 3.14159265358979324;<br>

                       one = 1.0;<br>

                       two = 2.0;<br>

                       four = 4.0;<br>

                       scale = one;<br>

                       printf("   Module     Error        RunTime    

                     MFLOPS\n");<br>

                       printf("                            (usec)\n");<br>

                       m = loops*10000;<br>

                       x = piref / ( four * (double)m );<br>

                       s = 0.0;<br>

                       v = 0.0;<br>

                       for( i = 1 ; i &lt;= m-1 ; i++ )<br>

                       {<br>

                       u = (double)i * x;<br>

                       w = u * u;<br>

                       v = u *

                    ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);<br>

                       s = s +

                    v*(w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one);<br>

                       }<br>

                       u = piref / four;<br>

                       w = u * u;<br>

                       sa =

                    u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);<br>

                       sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one;<br>

                       sa = sa * sb;<br>

                       sa = x * ( sa + two * s ) / two;<br>

                       sb = 0.25;<br>

                       sc = sa - sb;<br>

                       printf("     6   %13.4lf  %10.4lf  %10.4lf\n",<br>

                              sc* 1e-30,<br>

                              0* 1e-30 ,<br>

                              0* 1e-30);<br>

                       return 0;<br>

                    }<br>

                  </div>

                  <div>$ clang-base -O3 -maes -m64 -mcx16 -msse4.2

                    -mpclmul '-mprefer-vector-width=128' flops-6.c -o

                    flops-6-base<br>

                  </div>

                  <div>$ clang-new -O3 -maes -m64 -mcx16 -msse4.2

                    -mpclmul '-mprefer-vector-width=128' flops-6.c -o

                    flops-6-new</div>

                  <div>$ for i in $(seq 5) ; do time ./flops-6-base ;

                    done<br>

                         6          0.0000      0.0000      0.0000<br>

                    <br>

                    real    0m0.705s<br>

                    user    0m0.700s<br>

                    sys     0m0.004s<br>

                         6          0.0000      0.0000      0.0000<br>

                    <br>

                    real    0m0.706s<br>

                    user    0m0.704s<br>

                    sys     0m0.001s<br>

                         6          0.0000      0.0000      0.0000<br>

                    <br>

                    real    0m0.706s<br>

                    user    0m0.705s<br>

                    sys     0m0.001s<br>

                         6          0.0000      0.0000      0.0000<br>

                    <br>

                    real    0m0.706s<br>

                    user    0m0.704s<br>

                    sys     0m0.001s<br>

                         6          0.0000      0.0000      0.0000<br>

                    <br>

                    real    0m0.707s<br>

                    user    0m0.705s<br>

                    sys     0m0.001s<br>

                    $ for i in $(seq 5) ; do time ./flops-6-new ; done<br>

                         6          0.0000      0.0000      0.0000<br>

                    <br>

                    real    0m0.899s<br>

                    user    0m0.898s<br>

                    sys     0m0.000s<br>

                         6          0.0000      0.0000      0.0000<br>

                    <br>

                    real    0m0.899s<br>

                    user    0m0.898s<br>

                    sys     0m0.000s<br>

                         6          0.0000      0.0000      0.0000<br>

                    <br>

                    real    0m0.900s<br>

                    user    0m0.899s<br>

                    sys     0m0.000s<br>

                         6          0.0000      0.0000      0.0000<br>

                    <br>

                    real    0m0.899s<br>

                    user    0m0.898s<br>

                    sys     0m0.000s<br>

                         6          0.0000      0.0000      0.0000<br>

                    <br>

                    real    0m0.899s<br>

                    user    0m0.898s<br>

                    sys     0m0.000s<br>

                  </div>

                </div>

                <div><br>

                </div>

                <div>Can you take a look at this and maybe revert in the

                  meantime?</div>

                <div><br>

                </div>

                <div>Thanks!</div>

                <div><br>

                </div>

                <div>-- Alex</div>

              </div>

              <br>

              <div class="gmail_quote">

                <div dir="ltr" class="gmail_attr">On Mon, May 10, 2021

                  at 4:10 PM Alexey Bataev via llvm-commits &lt;<a

                    href="mailto:llvm-commits@lists.llvm.org"

                    target="_blank" moz-do-not-send="true">llvm-commits@lists.llvm.org</a>&gt;

                  wrote:<br>

                </div>

                <blockquote class="gmail_quote" style="margin:0px 0px

                  0px 0.8ex;border-left:1px solid

                  rgb(204,204,204);padding-left:1ex"><br>

                  Author: Alexey Bataev<br>

                  Date: 2021-05-10T07:08:07-07:00<br>

                  New Revision: 30463bc3f1839e8a238be4c137e2356f3cca2771<br>

                  <br>

                  URL: <a

href="https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771"

                    rel="noreferrer" target="_blank"

                    moz-do-not-send="true">https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771</a><br>

                  DIFF: <a

href="https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771.diff"

                    rel="noreferrer" target="_blank"

                    moz-do-not-send="true">https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771.diff</a><br>

                  <br>

                  LOG: [SLP]Do not count perfect diamond matches for

                  gathers several times.<br>

                  <br>

                  Need to remove the old code for avoiding double

                  counting of the gather<br>

                  nodes with perfect diamond matches within the tree

                  after we started<br>

                  detecting perfect/shuffled matching in the previous

                  patch D100495. We<br>

                  may skip the cost for such nodes completely.<br>

                  <br>

                  Differential Revision: <a

                    href="https://reviews.llvm.org/D102023"

                    rel="noreferrer" target="_blank"

                    moz-do-not-send="true">https://reviews.llvm.org/D102023</a><br>

                  <br>

                  Added: <br>

                  <br>

                  <br>

                  Modified: <br>

                      llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

                  llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>

                  <br>

                  Removed: <br>

                  <br>

                  <br>

                  <br>

################################################################################<br>

                  diff  --git

                  a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

                  b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

                  index 22e090fd1d7c..e656b189c779 100644<br>

                  --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

                  +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

                  @@ -4233,27 +4233,6 @@ InstructionCost

                  BoUpSLP::getTreeCost() {<br>

                     for (unsigned I = 0, E = VectorizableTree.size(); I

                  &lt; E; ++I) {<br>

                       TreeEntry &amp;TE = *VectorizableTree[I].get();<br>

                  <br>

                  -    // We create duplicate tree entries for gather

                  sequences that have multiple<br>

                  -    // uses. However, we should not compute the cost

                  of duplicate sequences.<br>

                  -    // For example, if we have a build vector (i.e.,

                  insertelement sequence)<br>

                  -    // that is used by more than one vector

                  instruction, we only need to<br>

                  -    // compute the cost of the insertelement

                  instructions once. The redundant<br>

                  -    // instructions will be eliminated by CSE.<br>

                  -    //<br>

                  -    // We should consider not creating duplicate tree

                  entries for gather<br>

                  -    // sequences, and instead add additional edges to

                  the tree representing<br>

                  -    // their uses. Since such an approach results in

                  fewer total entries,<br>

                  -    // existing heuristics based on tree size may

                  yield different results.<br>

                  -    //<br>

                  -    if (TE.State == TreeEntry::NeedToGather

                  &amp;&amp;<br>

                  -       

                  std::any_of(std::next(VectorizableTree.begin(), I +

                  1),<br>

                  -                    VectorizableTree.end(),<br>

                  -                    [TE](const

                  std::unique_ptr&lt;TreeEntry&gt; &amp;EntryPtr) {<br>

                  -                      return EntryPtr-&gt;State ==

                  TreeEntry::NeedToGather &amp;&amp;<br>

                  -                           

                   EntryPtr-&gt;isSame(TE.Scalars);<br>

                  -                    }))<br>

                  -      continue;<br>

                  -<br>

                       InstructionCost C = getEntryCost(&amp;TE);<br>

                       Cost += C;<br>

                       LLVM_DEBUG(dbgs() &lt;&lt; "SLP: Adding cost "

                  &lt;&lt; C<br>

                  <br>

                  diff  --git

                  a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll

b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>

                  index 31c63d31f4df..57db62ace206 100644<br>

                  ---

                  a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>

                  +++

                  b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>

                  @@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu"<br>

                   ; REMARK-LABEL: Function: gather_multiple_use<br>

                   ; REMARK:       Args:<br>

                   ; REMARK-NEXT:    - String: 'Vectorized horizontal

                  reduction with cost '<br>

                  -; REMARK-NEXT:    - Cost: '-16'<br>

                  +; REMARK-NEXT:    - Cost: '-7'<br>

                   ;<br>

                   ; REMARK-NOT: Function: gather_load<br>

                  <br>

                  <br>

                  <br>

                  <br>

                  _______________________________________________<br>

                  llvm-commits mailing list<br>

                  <a href="mailto:llvm-commits@lists.llvm.org"

                    target="_blank" moz-do-not-send="true">llvm-commits@lists.llvm.org</a><br>

                  <a

                    href="https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits"

                    rel="noreferrer" target="_blank"

                    moz-do-not-send="true">https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits</a><br>

                </blockquote>

              </div>

            </blockquote>

          </div>

        </blockquote>

      </div>

    </blockquote>

  </body>

</html>