<!doctype html>
<html lang="en">

    <head>
      <!-- Declare the encoding first so the parser never has to guess it. -->
      <meta charset="utf-8">
      <title>Bug 25219 - [ppc] LLVM built 470.lbm is 9.5% slower than gcc on power8</title>
      <!-- Relative links in this notification resolve against the LLVM Bugzilla instance. -->
      <base href="https://llvm.org/bugs/">
    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - [ppc] LLVM built 470.lbm is 9.5% slower than gcc on power8"

   href="https://llvm.org/bugs/show_bug.cgi?id=25219">25219</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>[ppc] LLVM built 470.lbm is 9.5% slower than gcc on power8

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: PowerPC

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>carrot@google.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>The following compiler options are used

-fno-strict-aliasing -O2 -m64 -mvsx -mcpu=power8 -ffp-contract=fast

More than 98% of execution time is in function LBM_performStreamCollide. It

contains a single loop; the related code is:

void LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid ) {

  for (...)

  {

     ...

                ux = + SRC_E ( srcGrid ) - SRC_W ( srcGrid )

                     + SRC_NE( srcGrid ) - SRC_NW( srcGrid )

                     + SRC_SE( srcGrid ) - SRC_SW( srcGrid )

                     + SRC_ET( srcGrid ) + SRC_EB( srcGrid )

                     - SRC_WT( srcGrid ) - SRC_WB( srcGrid );

                uy = + SRC_N ( srcGrid ) - SRC_S ( srcGrid )

                     + SRC_NE( srcGrid ) + SRC_NW( srcGrid )

                     - SRC_SE( srcGrid ) - SRC_SW( srcGrid )

                     + SRC_NT( srcGrid ) + SRC_NB( srcGrid )

                     - SRC_ST( srcGrid ) - SRC_SB( srcGrid );

                uz = + SRC_T ( srcGrid ) - SRC_B ( srcGrid )

                     + SRC_NT( srcGrid ) - SRC_NB( srcGrid )

                     + SRC_ST( srcGrid ) - SRC_SB( srcGrid )

                     + SRC_ET( srcGrid ) - SRC_EB( srcGrid )

                     + SRC_WT( srcGrid ) - SRC_WB( srcGrid );

                ux /= rho;

                uy /= rho;

                uz /= rho;

                if( TEST_FLAG_SWEEP( srcGrid, ACCEL )) {

                        ux = 0.005;

                        uy = 0.002;

                        uz = 0.000;

                }

        ...

  }

}

LLVM transforms the code into

           if( !TEST_FLAG_SWEEP( srcGrid, ACCEL )) {

                ux = + SRC_E ( srcGrid ) - SRC_W ( srcGrid )

                     + SRC_NE( srcGrid ) - SRC_NW( srcGrid )

                     + SRC_SE( srcGrid ) - SRC_SW( srcGrid )

                     + SRC_ET( srcGrid ) + SRC_EB( srcGrid )

                     - SRC_WT( srcGrid ) - SRC_WB( srcGrid );

                ux /= rho;

            }

            else 

                ux = 0.005;

            if( !TEST_FLAG_SWEEP( srcGrid, ACCEL )) {

                uy = + SRC_N ( srcGrid ) - SRC_S ( srcGrid )

                     + SRC_NE( srcGrid ) + SRC_NW( srcGrid )

                     - SRC_SE( srcGrid ) - SRC_SW( srcGrid )

                     + SRC_NT( srcGrid ) + SRC_NB( srcGrid )

                     - SRC_ST( srcGrid ) - SRC_SB( srcGrid );

                uy /= rho;

             }

             else

                uy = 0.002;

             if( !TEST_FLAG_SWEEP( srcGrid, ACCEL )) {

                uz = + SRC_T ( srcGrid ) - SRC_B ( srcGrid )

                     + SRC_NT( srcGrid ) - SRC_NB( srcGrid )

                     + SRC_ST( srcGrid ) - SRC_SB( srcGrid )

                     + SRC_ET( srcGrid ) - SRC_EB( srcGrid )

                     + SRC_WT( srcGrid ) - SRC_WB( srcGrid );

                uz /= rho;

             }

             else

                uz = 0.000;

Note that the following floating point expressions form a dependence chain containing

10 floating point instructions

                uz = + SRC_T ( srcGrid ) - SRC_B ( srcGrid )

                     + SRC_NT( srcGrid ) - SRC_NB( srcGrid )

                     + SRC_ST( srcGrid ) - SRC_SB( srcGrid )

                     + SRC_ET( srcGrid ) - SRC_EB( srcGrid )

                     + SRC_WT( srcGrid ) - SRC_WB( srcGrid );

                uz /= rho;

On power8 each fp instruction has a latency of 6 or more cycles, so it needs at

least 60 cycles to execute each of the three dependence chains.

GCC doesn't do the control flow transform, so it can interleave the 3

dependence chains, and the resulting code is much faster.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>