<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - AArch64 unneccessary widening lowers vector performance"
href="https://bugs.llvm.org/show_bug.cgi?id=46888">46888</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>AArch64 unnecessary widening lowers vector performance
</td>
</tr>
<tr>
<th>Product</th>
<td>tools
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>Other
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>opt
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>joel.hutton@arm.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>For the following test snippet:

#include <stdlib.h>   /* abs() */

int arrSum(unsigned char *a1, int inc_a1, unsigned char *a2,
           int inc_a2) {
  int sum = 0;
  for (int y = 0; y < 16; y++) {
    for (int x = 0; x < 16; x++) {
      sum += abs(a1[x] - a2[x]);
    }
    a1 += inc_a1;
    a2 += inc_a2;
  }
  return sum;
}
Using clang -O3, LLVM trunk widens the bytes to 16-bit halfwords with ushll
instructions and then takes the absolute differences of the halfwords:
sxtw x8, w3
ldr q4, [x0]
ldr q5, [x2]
add x10, x0, x9
add x11, x2, x8
ldr q0, [x10]
ldr q1, [x11]
add x10, x10, x9
add x11, x11, x8
ldr q2, [x10]
ldr q3, [x11]
add x10, x10, x9
add x11, x11, x8
ushll v7.8h, v4.8b, #0
ushll2 v16.8h, v4.16b, #0
ushll v17.8h, v5.8b, #0
ushll2 v5.8h, v5.16b, #0
ldr q4, [x10]
ldr q6, [x11]
uabdl v18.4s, v16.4h, v5.4h
uabdl v19.4s, v7.4h, v17.4h
uabdl2 v16.4s, v16.8h, v5.8h
uabdl2 v17.4s, v7.8h, v17.8h
This is wider than necessary: the absolute-difference operation can be
performed on the bytes directly. Operating on bytes processes 8 lanes per
instruction instead of 4 and avoids the unnecessary widening shifts. This can
be seen in the GCC codegen below (an intrinsics sketch of the same byte-level
pattern follows the listing).
For gcc (trunk) -O3:
sxtw x2, w3
sxtw x3, w1
add x11, x5, x2
ldr q1, [x4]
add x9, x11, x2
ldr q5, [x5]
add x7, x9, x2
ldr q3, [x4, w1, sxtw]
add x4, x4, x3
uabdl2 v2.8h, v1.16b, v5.16b
add x10, x4, x3
ldr q4, [x5, w2, sxtw]
add x8, x10, x3
movi v0.4s, 0
add x6, x8, x3
uabal v2.8h, v1.8b, v5.8b
add x5, x7, x2
uabdl2 v1.8h, v3.16b, v4.16b
add x19, x5, x2
ldr q6, [x11, w2, sxtw]
mov x0, x2
ldr q5, [x4, w1, sxtw]
add x4, x6, x3
uabal v1.8h, v3.8b, v4.8b
add x30, x4, x3
uadalp v0.4s, v2.8h
add x18, x19, x2
uabdl2 v2.8h, v5.16b, v6.16b
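
For illustration, the byte-level pattern can also be written with ACLE NEON
intrinsics. The following is only a sketch of the approach (the name
arrSumNeon and the final vaddvq_u32 reduction are illustrative, not taken
from the original code); it maps onto the uabdl2/uabal/uadalp sequence GCC
emits:

#include <arm_neon.h>

int arrSumNeon(const unsigned char *a1, int inc_a1,
               const unsigned char *a2, int inc_a2) {
  uint32x4_t acc = vdupq_n_u32(0);
  for (int y = 0; y < 16; y++) {
    uint8x16_t v1 = vld1q_u8(a1);
    uint8x16_t v2 = vld1q_u8(a2);
    /* uabdl2: absolute difference of the high 8 bytes, widened to u16 */
    uint16x8_t d = vabdl_high_u8(v1, v2);
    /* uabal: accumulate absolute difference of the low 8 bytes */
    d = vabal_u8(d, vget_low_u8(v1), vget_low_u8(v2));
    /* uadalp: pairwise-accumulate the halfword lanes into u32 lanes */
    acc = vpadalq_u16(acc, d);
    a1 += inc_a1;
    a2 += inc_a2;
  }
  /* addv: horizontal sum of the four 32-bit lanes */
  return (int)vaddvq_u32(acc);
}

This handles a 16-byte row in three vector ops (uabdl2 + uabal + uadalp),
where the trunk codegen above spends four ushll and four uabdl on the same
bytes before any accumulation.
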
This affects SPEC2017 x264 performance.</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>