<div dir="ltr">Given that the change causes performance regressions in the range of 20-30% for some benchmarks, would you agree to roll back the patch while you're working on a fix? It would be helpful for us if there were a cleaner version in mainline in the meantime.<div><br></div><div>Thanks!</div></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Thu, May 20, 2021 at 10:11 PM Alexey.Bataev &lt;<a href="mailto:a.bataev@outlook.com">a.bataev@outlook.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">

  <div>

    Checked the regression, the fix is correct but I need to prepare

    another patch for better match detection in the vectorization tree.

    Hope to commit it in a day or two.<br>

    <pre cols="72">-------------

Best regards,

Alexey Bataev</pre>

    <div>5/20/2021 12:03 PM, Alexander Kornienko

      пишет:<br>

    </div>

    <blockquote type="cite">

      <div dir="ltr">We see performance regressions after this patch. A

        number of benchmarks regressed for more than 10%. One example is

        the flops-6.c from the LLVM test-suite. An isolated test based

        on that benchmark:

        <div>

          <div><br>

          </div>

          <div>$ cat flops-6.c</div>

          <div>extern int printf (const char *__restrict __format, ...);<br>

            double T[36];<br>

            double sa,sb,sc,sd,one,two;<br>

            double four,piref;<br>

            double scale;<br>

            double A1 = -0.1666666666671334;<br>

            double A2 = 0.833333333809067E-2;<br>

            double A3 = 0.198412715551283E-3;<br>

            double A4 = 0.27557589750762E-5;<br>

            double A5 = 0.2507059876207E-7;<br>

            double A6 = 0.164105986683E-9;<br>

            double B1 = -0.4999999999982;<br>

            double B2 = 0.4166666664651E-1;<br>

            double B3 = -0.1388888805755E-2;<br>

            double B4 = 0.24801428034E-4;<br>

            double B5 = -0.2754213324E-6;<br>

            double B6 = 0.20189405E-8;<br>

            int main()<br>

            {<br>

               double s,u,v,w,x;<br>

               long loops;<br>

               register long i, m, n;<br>

               printf("\n");<br>

               printf("   FLOPS C Program (Double Precision), V2.0 18

            Dec 1992\n\n");<br>

               loops = 15625;<br>

               piref = 3.14159265358979324;<br>

               one = 1.0;<br>

               two = 2.0;<br>

               four = 4.0;<br>

               scale = one;<br>

               printf("   Module     Error        RunTime    

             MFLOPS\n");<br>

               printf("                            (usec)\n");<br>

               m = loops*10000;<br>

               x = piref / ( four * (double)m );<br>

               s = 0.0;<br>

               v = 0.0;<br>

               for( i = 1 ; i <= m-1 ; i++ )<br>

               {<br>

               u = (double)i * x;<br>

               w = u * u;<br>

               v = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);<br>

               s = s + v*(w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one);<br>

               }<br>

               u = piref / four;<br>

               w = u * u;<br>

               sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one);<br>

               sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one;<br>

               sa = sa * sb;<br>

               sa = x * ( sa + two * s ) / two;<br>

               sb = 0.25;<br>

               sc = sa - sb;<br>

               printf("     6   %13.4lf  %10.4lf  %10.4lf\n",<br>

                      sc* 1e-30,<br>

                      0* 1e-30 ,<br>

                      0* 1e-30);<br>

               return 0;<br>

            }<br>

          </div>

          <div>$ clang-base -O3 -maes -m64 -mcx16 -msse4.2 -mpclmul

            '-mprefer-vector-width=128' flops-6.c -o flops-6-base<br>

          </div>

          <div>$ clang-new -O3 -maes -m64 -mcx16 -msse4.2 -mpclmul

            '-mprefer-vector-width=128' flops-6.c -o flops-6-new</div>

          <div>$ for i in $(seq 5) ; do time ./flops-6-base ; done<br>

                 6          0.0000      0.0000      0.0000<br>

            <br>

            real    0m0.705s<br>

            user    0m0.700s<br>

            sys     0m0.004s<br>

                 6          0.0000      0.0000      0.0000<br>

            <br>

            real    0m0.706s<br>

            user    0m0.704s<br>

            sys     0m0.001s<br>

                 6          0.0000      0.0000      0.0000<br>

            <br>

            real    0m0.706s<br>

            user    0m0.705s<br>

            sys     0m0.001s<br>

                 6          0.0000      0.0000      0.0000<br>

            <br>

            real    0m0.706s<br>

            user    0m0.704s<br>

            sys     0m0.001s<br>

                 6          0.0000      0.0000      0.0000<br>

            <br>

            real    0m0.707s<br>

            user    0m0.705s<br>

            sys     0m0.001s<br>

            $ for i in $(seq 5) ; do time ./flops-6-new ; done<br>

                 6          0.0000      0.0000      0.0000<br>

            <br>

            real    0m0.899s<br>

            user    0m0.898s<br>

            sys     0m0.000s<br>

                 6          0.0000      0.0000      0.0000<br>

            <br>

            real    0m0.899s<br>

            user    0m0.898s<br>

            sys     0m0.000s<br>

                 6          0.0000      0.0000      0.0000<br>

            <br>

            real    0m0.900s<br>

            user    0m0.899s<br>

            sys     0m0.000s<br>

                 6          0.0000      0.0000      0.0000<br>

            <br>

            real    0m0.899s<br>

            user    0m0.898s<br>

            sys     0m0.000s<br>

                 6          0.0000      0.0000      0.0000<br>

            <br>

            real    0m0.899s<br>

            user    0m0.898s<br>

            sys     0m0.000s<br>

          </div>

        </div>

        <div><br>

        </div>

        <div>Can you take a look at this and maybe revert in the

          meantime?</div>

        <div><br>

        </div>

        <div>Thanks!</div>

        <div><br>

        </div>

        <div>-- Alex</div>

      </div>

      <br>

      <div class="gmail_quote">

        <div dir="ltr" class="gmail_attr">On Mon, May 10, 2021 at 4:10

          PM Alexey Bataev via llvm-commits &lt;<a href="mailto:llvm-commits@lists.llvm.org" target="_blank">llvm-commits@lists.llvm.org</a>&gt;

          wrote:<br>

        </div>

        <blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><br>

          Author: Alexey Bataev<br>

          Date: 2021-05-10T07:08:07-07:00<br>

          New Revision: 30463bc3f1839e8a238be4c137e2356f3cca2771<br>

          <br>

          URL: <a href="https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771" rel="noreferrer" target="_blank">https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771</a><br>

          DIFF: <a href="https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771.diff" rel="noreferrer" target="_blank">https://github.com/llvm/llvm-project/commit/30463bc3f1839e8a238be4c137e2356f3cca2771.diff</a><br>

          <br>

          LOG: [SLP]Do not count perfect diamond matches for gathers

          several times.<br>

          <br>

          Need to remove the old code for avoiding double counting of

          the gather<br>

          nodes with perfect diamond matches within the tree after we

          started<br>

          detecting perfect/shuffled matching in the previous patch

          D100495. We<br>

          may skip the cost for such nodes completely.<br>

          <br>

          Differential Revision: <a href="https://reviews.llvm.org/D102023" rel="noreferrer" target="_blank">https://reviews.llvm.org/D102023</a><br>

          <br>

          Added: <br>

          <br>

          <br>

          Modified: <br>

              llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

              llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>

          <br>

          Removed: <br>

          <br>

          <br>

          <br>

################################################################################<br>

          diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

          b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

          index 22e090fd1d7c..e656b189c779 100644<br>

          --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

          +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

          @@ -4233,27 +4233,6 @@ InstructionCost BoUpSLP::getTreeCost()

          {<br>

             for (unsigned I = 0, E = VectorizableTree.size(); I < E;

          ++I) {<br>

               TreeEntry &TE = *VectorizableTree[I].get();<br>

          <br>

          -    // We create duplicate tree entries for gather sequences

          that have multiple<br>

          -    // uses. However, we should not compute the cost of

          duplicate sequences.<br>

          -    // For example, if we have a build vector (i.e.,

          insertelement sequence)<br>

          -    // that is used by more than one vector instruction, we

          only need to<br>

          -    // compute the cost of the insertelement instructions

          once. The redundant<br>

          -    // instructions will be eliminated by CSE.<br>

          -    //<br>

          -    // We should consider not creating duplicate tree entries

          for gather<br>

          -    // sequences, and instead add additional edges to the

          tree representing<br>

          -    // their uses. Since such an approach results in fewer

          total entries,<br>

          -    // existing heuristics based on tree size may yield <br>

          different results.<br>

          -    //<br>

          -    if (TE.State == TreeEntry::NeedToGather &&<br>

          -        std::any_of(std::next(VectorizableTree.begin(), I +

          1),<br>

          -                    VectorizableTree.end(),<br>

          -                    [TE](const

          std::unique_ptr&lt;TreeEntry&gt; &amp;EntryPtr) {<br>

          -                      return EntryPtr->State ==

          TreeEntry::NeedToGather &&<br>

          -                             EntryPtr->isSame(TE.Scalars);<br>

          -                    }))<br>

          -      continue;<br>

          -<br>

               InstructionCost C = getEntryCost(&TE);<br>

               Cost += C;<br>

               LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C<br>

          <br>

          diff  --git

          a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll

          b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>

          index 31c63d31f4df..57db62ace206 100644<br>

          ---

          a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>

          +++

          b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll<br>

          @@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu"<br>

           ; REMARK-LABEL: Function: gather_multiple_use<br>

           ; REMARK:       Args:<br>

           ; REMARK-NEXT:    - String: 'Vectorized horizontal reduction

          with cost '<br>

          -; REMARK-NEXT:    - Cost: '-16'<br>

          +; REMARK-NEXT:    - Cost: '-7'<br>

           ;<br>

           ; REMARK-NOT: Function: gather_load<br>

          <br>

          <br>

          <br>

          <br>

          _______________________________________________<br>

          llvm-commits mailing list<br>

          <a href="mailto:llvm-commits@lists.llvm.org" target="_blank">llvm-commits@lists.llvm.org</a><br>

          <a href="https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits" rel="noreferrer" target="_blank">https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits</a><br>

        </blockquote>

      </div>

    </blockquote>

  </div>

</blockquote></div>