<div dir="ltr">Given that the change causes performance regressions in the range of 20-30% for some benchmarks, would you agree to roll back the patch while you're working on a fix? It would be helpful for us if there were a cleaner version in mainline in the meantime.<div><br></div><div>Thanks!</div></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Thu, May 20, 2021 at 10:11 PM Alexey.Bataev &lt;<a href="mailto:a.bataev@outlook.com">a.bataev@outlook.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
  
    
  
  <div>
    I checked the regression; the fix is correct, but I need to prepare
    another patch for better match detection in the vectorization tree.
    I hope to commit it in a day or two.<br>
    <pre cols="72">-------------
Best regards,
Alexey Bataev</pre>
    <div>5/20/2021 12:03 PM, Alexander Kornienko
      пишет:<br>
    </div>
    <blockquote type="cite">
      
      <div dir="ltr">We see performance regressions after this patch. A
        number of benchmarks regressed by more than 10%. One example is
        flops-6.c from the LLVM test-suite. An isolated test based
        on that benchmark:
        <div>
          <div><br>
          </div>
          <div>$ cat flops-6.c</div>
          <div>extern int printf (const char *__restrict __format, ...);<br>
            double T[36];<br>
            double sa,sb,sc,sd,one,two;<br>
            double four,piref;<br>
            double scale;<br>
            double A1 = -0.1666666666671334;<br>
            double A2 = 0.833333333809067E-2;<br>
            double A3 = 0.198412715551283E-3;<br>
            double A4 = 0.27557589750762E-5;<br>
            double A5 = 0.2507059876207E-7;<br>
            double A6 = 0.164105986683E-9;<br>
            double B1 = -0.4999999999982;<br>
            double B2 = 0.4166666664651E-1;<br>
            double B3 = -0.1388888805755E-2;<br>
            double B4 = 0.24801428034E-4;<br>
            double B5 = -0.2754213324E-6;<br>
            double B6 = 0.20189405E-8;<br>
            int main()<br>
            {<br>
               double s,u,v,w,x;<br>
               long loops;<br>
               register long i, m, n;<br>
               printf("\n");<br>
               printf("   FLOPS C Program (Double Precision), V2.0 18
            Dec 1992\n\n");<br>
               loops = 15625;<br>
               piref = 3.14159265358979324;<br>
               one = 1.0;<br>
               two = 2.0;<br>
               four = 4.0;<br>
               scale = one;<br>
               printf("   Module     Error        RunTime    
             MFLOPS\n");<br>
               printf("                            (usec)\n");<br>
               m = loops*10000;<br>
               x = piref / ( four * (double)m );<br>
               s = 0.0;<br>
               v = 0.0;<br>
               for( i = 1 ; i <= m-1 ; i++ )<br>
               {<br>
               u = (double)i * x;<br>
               w = u * u;<br>
               v = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);<br>
               s = s + v*(w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one);<br>
               }<br>
               u = piref / four;<br>
               w = u * u;<br>
               sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);<br>
               sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one;<br>
               sa = sa * sb;<br>
               sa = x * ( sa + two * s ) / two;<br>
               sb = 0.25;<br>
               sc = sa - sb;<br>
               printf("     6   %13.4lf  %10.4lf  %10.4lf\n",<br>
                      sc* 1e-30,<br>
                      0* 1e-30 ,<br>
                      0* 1e-30);<br>
               return 0;<br>
            }<br>
          </div>
          <div>$ clang-base -O3 -maes -m64 -mcx16 -msse4.2 -mpclmul
            '-mprefer-vector-width=128' flops-6.c -o flops-6-base<br>
          </div>
          <div>$ clang-new -O3 -maes -m64 -mcx16 -msse4.2 -mpclmul
            '-mprefer-vector-width=128' flops-6.c -o flops-6-new</div>
          <div>$ for i in $(seq 5) ; do time ./flops-6-base ; done<br>
                 6          0.0000      0.0000      0.0000<br>
            <br>
            real    0m0.705s<br>
            user    0m0.700s<br>
            sys     0m0.004s<br>
                 6          0.0000      0.0000      0.0000<br>
            <br>
            real    0m0.706s<br>
            user    0m0.704s<br>
            sys     0m0.001s<br>
                 6          0.0000      0.0000      0.0000<br>
            <br>
            real    0m0.706s<br>
            user    0m0.705s<br>
            sys     0m0.001s<br>
                 6          0.0000      0.0000      0.0000<br>
            <br>
            real    0m0.706s<br>
            user    0m0.704s<br>
            sys     0m0.001s<br>
                 6          0.0000      0.0000      0.0000<br>
            <br>
            real    0m0.707s<br>
            user    0m0.705s<br>
            sys     0m0.001s<br>
            $ for i in $(seq 5) ; do time ./flops-6-new ; done<br>
                 6          0.0000      0.0000      0.0000<br>
            <br>
            real    0m0.899s<br>
            user    0m0.898s<br>
            sys     0m0.000s<br>
                 6          0.0000      0.0000      0.0000<br>
            <br>
            real    0m0.899s<br>
            user    0m0.898s<br>
            sys     0m0.000s<br>
                 6          0.0000      0.0000      0.0000<br>
            <br>
            real    0m0.900s<br>
            user    0m0.899s<br>
            sys     0m0.000s<br>
                 6          0.0000      0.0000      0.0000<br>
            <br>
            real    0m0.899s<br>
            user    0m0.898s<br>
            sys     0m0.000s<br>
                 6          0.0000      0.0000      0.0000<br>
            <br>
            real    0m0.899s<br>
            user    0m0.898s<br>
            sys     0m0.000s<br>
          </div>
        </div>
        <div><br>
        </div>
        <div>Can you take a look at this and maybe revert in the
          meantime?</div>
        <div><br>
        </div>
        <div>Thanks!</div>
        <div><br>
        </div>
        <div>-- Alex</div>
      </div>
      <br>
      <div class="gmail_quote">
        <div dir="ltr" class="gmail_attr">On Mon, May 10, 2021 at 4:10
          PM Alexey Bataev via llvm-commits &lt;<a href="mailto:llvm-commits@lists.llvm.org" target="_blank">llvm-commits@lists.llvm.org</a>&gt;
          wrote:<br>
        </div>
        <blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><br>
          Author: Alexey Bataev<br>
          Date: 2021-05-10T07:08:07-07:00<br>
          New Revision: 30463bc3f1839e8a238be4c137e2356f3cca2771<br>
          <br>
          URL: <a href="https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771" rel="noreferrer" target="_blank">https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771</a><br>
          DIFF: <a href="https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771.diff" rel="noreferrer" target="_blank">https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771.diff</a><br>
          <br>
          LOG: [SLP]Do not count perfect diamond matches for gathers
          several times.<br>
          <br>
          Need to remove the old code for avoiding double counting of
          the gather<br>
          nodes with perfect diamond matches within the tree after we
          started<br>
          detecting perfect/shuffled matching in the previous patch
          D100495. We<br>
          may skip the cost for such nodes completely.<br>
          <br>
          Differential Revision: <a href="https://reviews.llvm.org/D102023" rel="noreferrer" target="_blank">https://reviews.llvm.org/D102023</a><br>
          <br>
          Added: <br>
          <br>
          <br>
          Modified: <br>
              llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>
              llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>
          <br>
          Removed: <br>
          <br>
          <br>
          <br>
################################################################################<br>
          diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
          b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>
          index 22e090fd1d7c..e656b189c779 100644<br>
          --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>
          +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>
          @@ -4233,27 +4233,6 @@ InstructionCost BoUpSLP::getTreeCost()
          {<br>
             for (unsigned I = 0, E = VectorizableTree.size(); I < E;
          ++I) {<br>
               TreeEntry &TE = *VectorizableTree[I].get();<br>
          <br>
          -    // We create duplicate tree entries for gather sequences
          that have multiple<br>
          -    // uses. However, we should not compute the cost of
          duplicate sequences.<br>
          -    // For example, if we have a build vector (i.e.,
          insertelement sequence)<br>
          -    // that is used by more than one vector instruction, we
          only need to<br>
          -    // compute the cost of the insertelement instructions
          once. The redundant<br>
          -    // instructions will be eliminated by CSE.<br>
          -    //<br>
          -    // We should consider not creating duplicate tree entries
          for gather<br>
          -    // sequences, and instead add additional edges to the
          tree representing<br>
          -    // their uses. Since such an approach results in fewer
          total entries,<br>
          -    // existing heuristics based on tree size may yield <br>
          different results.<br>
          -    //<br>
          -    if (TE.State == TreeEntry::NeedToGather &&<br>
          -        std::any_of(std::next(VectorizableTree.begin(), I +
          1),<br>
          -                    VectorizableTree.end(),<br>
          -                    [TE](const
          std::unique_ptr&lt;TreeEntry&gt; &EntryPtr) {<br>
          -                      return EntryPtr->State ==
          TreeEntry::NeedToGather &&<br>
          -                             EntryPtr->isSame(TE.Scalars);<br>
          -                    }))<br>
          -      continue;<br>
          -<br>
               InstructionCost C = getEntryCost(&TE);<br>
               Cost += C;<br>
               LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C<br>
          <br>
          diff --git
          a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
          b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>
          index 31c63d31f4df..57db62ace206 100644<br>
          ---
          a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>
          +++
          b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>
          @@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu"<br>
           ; REMARK-LABEL: Function: gather_multiple_use<br>
           ; REMARK:       Args:<br>
           ; REMARK-NEXT:    - String: 'Vectorized horizontal reduction
          with cost '<br>
          -; REMARK-NEXT:    - Cost: '-16'<br>
          +; REMARK-NEXT:    - Cost: '-7'<br>
           ;<br>
           ; REMARK-NOT: Function: gather_load<br>
          <br>
          <br>
          <br>
          <br>
          _______________________________________________<br>
          llvm-commits mailing list<br>
          <a href="mailto:llvm-commits@lists.llvm.org" target="_blank">llvm-commits@lists.llvm.org</a><br>
          <a href="https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits" rel="noreferrer" target="_blank">https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits</a><br>
        </blockquote>
      </div>
    </blockquote>
  </div>

</blockquote></div>