<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  </head>
  <body>
    The problem is that without this patch, other benchmarks show a
    perf regression of about 20-30%. I already know the root cause;
    the fix is in progress and I hope to have it ready in 1-2 hours,
    I am just polishing some final things.<br>
    <p><br>
    </p>
    <pre class="moz-signature" cols="72">-------------
Best regards,
Alexey Bataev</pre>
    <div class="moz-cite-prefix">5/21/2021 8:11 AM, Alexander Kornienko
      пишет:<br>
    </div>
    <blockquote type="cite"
cite="mid:CAOweq9JYFEOfjSpnccqGYTEaHWCqygaHPPtN4=C2x1EpHjE-nw@mail.gmail.com">
      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
      <div dir="ltr">Given that the change causes performance
        regressions in the range of 20-30% for some benchmarks, would
        you agree to roll back the patch while you're working on a fix?
        It would be helpful for us, if there was a cleaner version in
        mainline in the meantime.
        <div><br>
        </div>
        <div>Thanks!</div>
      </div>
      <br>
      <div class="gmail_quote">
        <div dir="ltr" class="gmail_attr">On Thu, May 20, 2021 at 10:11
          PM Alexey.Bataev &lt;<a href="mailto:a.bataev@outlook.com"
            moz-do-not-send="true">a.bataev@outlook.com</a>&gt; wrote:<br>
        </div>
        <blockquote class="gmail_quote" style="margin:0px 0px 0px
          0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
          <div> Checked the regression, the fix is correct but I need to
            prepare another patch for better match detection in the
            vectorization tree. Hope to commit it in a day or two.<br>
            <pre cols="72">-------------
Best regards,
Alexey Bataev</pre>
            <div>5/20/2021 12:03 PM, Alexander Kornienko пишет:<br>
            </div>
            <blockquote type="cite">
              <div dir="ltr">We see performance regressions after this
                patch. A number of benchmarks regressed by more than
                10%. One example is flops-6.c from the LLVM
                test-suite. An isolated test based on that benchmark:
                <div>
                  <div><br>
                  </div>
                  <div>$ cat flops-6.c</div>
                  <div>extern int printf (const char *__restrict
                    __format, ...);<br>
                    double T[36];<br>
                    double sa,sb,sc,sd,one,two;<br>
                    double four,piref;<br>
                    double scale;<br>
                    double A1 = -0.1666666666671334;<br>
                    double A2 = 0.833333333809067E-2;<br>
                    double A3 = 0.198412715551283E-3;<br>
                    double A4 = 0.27557589750762E-5;<br>
                    double A5 = 0.2507059876207E-7;<br>
                    double A6 = 0.164105986683E-9;<br>
                    double B1 = -0.4999999999982;<br>
                    double B2 = 0.4166666664651E-1;<br>
                    double B3 = -0.1388888805755E-2;<br>
                    double B4 = 0.24801428034E-4;<br>
                    double B5 = -0.2754213324E-6;<br>
                    double B6 = 0.20189405E-8;<br>
                    int main()<br>
                    {<br>
                       double s,u,v,w,x;<br>
                       long loops;<br>
                       register long i, m, n;<br>
                       printf("\n");<br>
                       printf("   FLOPS C Program (Double Precision),
                    V2.0 18 Dec 1992\n\n");<br>
                       loops = 15625;<br>
                       piref = 3.14159265358979324;<br>
                       one = 1.0;<br>
                       two = 2.0;<br>
                       four = 4.0;<br>
                       scale = one;<br>
                       printf("   Module     Error        RunTime    
                     MFLOPS\n");<br>
                       printf("                            (usec)\n");<br>
                       m = loops*10000;<br>
                       x = piref / ( four * (double)m );<br>
                       s = 0.0;<br>
                       v = 0.0;<br>
                       for( i = 1 ; i <= m-1 ; i++ )<br>
                       {<br>
                       u = (double)i * x;<br>
                       w = u * u;<br>
                       v = u *
                    ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);<br>
                       s = s +
                    v*(w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one);<br>
                       }<br>
                       u = piref / four;<br>
                       w = u * u;<br>
                       sa =
                    u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);<br>
                       sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one;<br>
                       sa = sa * sb;<br>
                       sa = x * ( sa + two * s ) / two;<br>
                       sb = 0.25;<br>
                       sc = sa - sb;<br>
                       printf("     6   %13.4lf  %10.4lf  %10.4lf\n",<br>
                              sc* 1e-30,<br>
                              0* 1e-30 ,<br>
                              0* 1e-30);<br>
                       return 0;<br>
                    }<br>
                  </div>
                  <div>$ clang-base -O3 -maes -m64 -mcx16 -msse4.2
                    -mpclmul '-mprefer-vector-width=128' flops-6.c -o
                    flops-6-base<br>
                  </div>
                  <div>$ clang-new -O3 -maes -m64 -mcx16 -msse4.2
                    -mpclmul '-mprefer-vector-width=128' flops-6.c -o
                    flops-6-new</div>
                  <div>$ for i in $(seq 5) ; do time ./flops-6-base ;
                    done<br>
                         6          0.0000      0.0000      0.0000<br>
                    <br>
                    real    0m0.705s<br>
                    user    0m0.700s<br>
                    sys     0m0.004s<br>
                         6          0.0000      0.0000      0.0000<br>
                    <br>
                    real    0m0.706s<br>
                    user    0m0.704s<br>
                    sys     0m0.001s<br>
                         6          0.0000      0.0000      0.0000<br>
                    <br>
                    real    0m0.706s<br>
                    user    0m0.705s<br>
                    sys     0m0.001s<br>
                         6          0.0000      0.0000      0.0000<br>
                    <br>
                    real    0m0.706s<br>
                    user    0m0.704s<br>
                    sys     0m0.001s<br>
                         6          0.0000      0.0000      0.0000<br>
                    <br>
                    real    0m0.707s<br>
                    user    0m0.705s<br>
                    sys     0m0.001s<br>
                    $ for i in $(seq 5) ; do time ./flops-6-new ; done<br>
                         6          0.0000      0.0000      0.0000<br>
                    <br>
                    real    0m0.899s<br>
                    user    0m0.898s<br>
                    sys     0m0.000s<br>
                         6          0.0000      0.0000      0.0000<br>
                    <br>
                    real    0m0.899s<br>
                    user    0m0.898s<br>
                    sys     0m0.000s<br>
                         6          0.0000      0.0000      0.0000<br>
                    <br>
                    real    0m0.900s<br>
                    user    0m0.899s<br>
                    sys     0m0.000s<br>
                         6          0.0000      0.0000      0.0000<br>
                    <br>
                    real    0m0.899s<br>
                    user    0m0.898s<br>
                    sys     0m0.000s<br>
                         6          0.0000      0.0000      0.0000<br>
                    <br>
                    real    0m0.899s<br>
                    user    0m0.898s<br>
                    sys     0m0.000s<br>
                  </div>
                </div>
                <div><br>
                </div>
                <div>Can you take a look at this and maybe revert in the
                  meantime?</div>
                <div><br>
                </div>
                <div>Thanks!</div>
                <div><br>
                </div>
                <div>-- Alex</div>
              </div>
              <br>
              <div class="gmail_quote">
                <div dir="ltr" class="gmail_attr">On Mon, May 10, 2021
                  at 4:10 PM Alexey Bataev via llvm-commits &lt;<a
                    href="mailto:llvm-commits@lists.llvm.org"
                    target="_blank" moz-do-not-send="true">llvm-commits@lists.llvm.org</a>&gt;
                  wrote:<br>
                </div>
                <blockquote class="gmail_quote" style="margin:0px 0px
                  0px 0.8ex;border-left:1px solid
                  rgb(204,204,204);padding-left:1ex"><br>
                  Author: Alexey Bataev<br>
                  Date: 2021-05-10T07:08:07-07:00<br>
                  New Revision: 30463bc3f1839e8a238be4c137e2356f3cca2771<br>
                  <br>
                  URL: <a
href="https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771"
                    rel="noreferrer" target="_blank"
                    moz-do-not-send="true">https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771</a><br>
                  DIFF: <a
href="https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771.diff"
                    rel="noreferrer" target="_blank"
                    moz-do-not-send="true">https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771.diff</a><br>
                  <br>
                  LOG: [SLP]Do not count perfect diamond matches for
                  gathers several times.<br>
                  <br>
                  Need to remove the old code for avoiding double
                  counting of the gather<br>
                  nodes with perfect diamond matches within the tree
                  after we started<br>
                  detecting perfect/shuffled matching in the previous
                  patch D100495. We<br>
                  may skip the cost for such nodes completely.<br>
                  <br>
                  Differential Revision: <a
                    href="https://reviews.llvm.org/D102023"
                    rel="noreferrer" target="_blank"
                    moz-do-not-send="true">https://reviews.llvm.org/D102023</a><br>
                  <br>
                  Added: <br>
                  <br>
                  <br>
                  Modified: <br>
                      llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>
                     
                  llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>
                  <br>
                  Removed: <br>
                  <br>
                  <br>
                  <br>
################################################################################<br>
                  diff  --git
                  a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
                  b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>
                  index 22e090fd1d7c..e656b189c779 100644<br>
                  --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>
                  +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>
                  @@ -4233,27 +4233,6 @@ InstructionCost
                  BoUpSLP::getTreeCost() {<br>
                     for (unsigned I = 0, E = VectorizableTree.size(); I
                  &lt; E; ++I) {<br>
                       TreeEntry &TE = *VectorizableTree[I].get();<br>
                  <br>
                  -    // We create duplicate tree entries for gather
                  sequences that have multiple<br>
                  -    // uses. However, we should not compute the cost
                  of duplicate sequences.<br>
                  -    // For example, if we have a build vector (i.e.,
                  insertelement sequence)<br>
                  -    // that is used by more than one vector
                  instruction, we only need to<br>
                  -    // compute the cost of the insertelement
                  instructions once. The redundant<br>
                  -    // instructions will be eliminated by CSE.<br>
                  -    //<br>
                  -    // We should consider not creating duplicate tree
                  entries for gather<br>
                  -    // sequences, and instead add additional edges to
                  the tree representing<br>
                  -    // their uses. Since such an approach results in
                  fewer total entries,<br>
                  -    // existing heuristics based on tree size may
                  yield different results.<br>
                  -    //<br>
                  -    if (TE.State == TreeEntry::NeedToGather
                  &&<br>
                  -       
                  std::any_of(std::next(VectorizableTree.begin(), I +
                  1),<br>
                  -                    VectorizableTree.end(),<br>
                  -                    [TE](const
                  std::unique_ptr&lt;TreeEntry&gt; &amp;EntryPtr) {<br>
                  -                      return EntryPtr->State ==
                  TreeEntry::NeedToGather &&<br>
                  -                           
                   EntryPtr->isSame(TE.Scalars);<br>
                  -                    }))<br>
                  -      continue;<br>
                  -<br>
                       InstructionCost C = getEntryCost(&TE);<br>
                       Cost += C;<br>
                       LLVM_DEBUG(dbgs() << "SLP: Adding cost "
                  &lt;&lt; C<br>
                  <br>
                  diff  --git
                  a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>
                  index 31c63d31f4df..57db62ace206 100644<br>
                  ---
                  a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>
                  +++
                  b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>
                  @@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu"<br>
                   ; REMARK-LABEL: Function: gather_multiple_use<br>
                   ; REMARK:       Args:<br>
                   ; REMARK-NEXT:    - String: 'Vectorized horizontal
                  reduction with cost '<br>
                  -; REMARK-NEXT:    - Cost: '-16'<br>
                  +; REMARK-NEXT:    - Cost: '-7'<br>
                   ;<br>
                   ; REMARK-NOT: Function: gather_load<br>
                  <br>
                  <br>
                  <br>
                  <br>
                  _______________________________________________<br>
                  llvm-commits mailing list<br>
                  <a href="mailto:llvm-commits@lists.llvm.org"
                    target="_blank" moz-do-not-send="true">llvm-commits@lists.llvm.org</a><br>
                  <a
                    href="https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits"
                    rel="noreferrer" target="_blank"
                    moz-do-not-send="true">https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits</a><br>
                </blockquote>
              </div>
            </blockquote>
          </div>
        </blockquote>
      </div>
    </blockquote>
  </body>
</html>