<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Byte swap idioms loses optimization on AVX+"

   href="https://bugs.llvm.org/show_bug.cgi?id=41545">41545</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Byte swap idioms loses optimization on AVX+

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>jed@59a2.org

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, spatel+llvm@rotateright.com

          </td>

        </tr></table>

      <p>

        <div>

        <pre>This code correctly optimizes to a simple load on x86-64:

static unsigned read_u32_le(const unsigned char arr[]) {

  return (arr[0] &lt;&lt; 0)

    | (arr[1] &lt;&lt; 8)

    | (arr[2] &lt;&lt; 16)

    | (arr[3] &lt;&lt; 24);

}

clang -O:

read_u32_le: # @read_u32_le

  mov eax, dword ptr [rdi]

  ret

However, when allowed to inline into code such as

unsigned sum_buf(int len, const unsigned char *arr) {

  unsigned sum = 0;

  for (int i=0; i&lt;len; i+=4) {

    sum += read_u32_le(arr+i);

  }

  return sum;

}

on AVX/AVX2, the optimization is lost. For example, with -march=haswell:

.LBB0_5: # =>This Inner Loop Header: Depth=1

  vmovdqu xmm7, xmmword ptr [rsi + 4*rax]

  vmovdqu xmm0, xmmword ptr [rsi + 4*rax + 16]

  vmovdqu xmm1, xmmword ptr [rsi + 4*rax + 32]

  vmovdqu xmm2, xmmword ptr [rsi + 4*rax + 48]

  vpblendw xmm11, xmm7, xmm8, 170 # xmm11 =

xmm7[0],xmm8[1],xmm7[2],xmm8[3],xmm7[4],xmm8[5],xmm7[6],xmm8[7]

  vpblendw xmm12, xmm0, xmm8, 170 # xmm12 =

xmm0[0],xmm8[1],xmm0[2],xmm8[3],xmm0[4],xmm8[5],xmm0[6],xmm8[7]

  vpblendw xmm13, xmm1, xmm8, 170 # xmm13 =

xmm1[0],xmm8[1],xmm1[2],xmm8[3],xmm1[4],xmm8[5],xmm1[6],xmm8[7]

  vpblendw xmm14, xmm2, xmm8, 170 # xmm14 =

xmm2[0],xmm8[1],xmm2[2],xmm8[3],xmm2[4],xmm8[5],xmm2[6],xmm8[7]

  vpand xmm3, xmm7, xmm9

  vpor xmm11, xmm11, xmm3

  vpand xmm3, xmm0, xmm9

  vpor xmm12, xmm12, xmm3

  vpand xmm3, xmm1, xmm9

  vpor xmm13, xmm13, xmm3

  vpand xmm3, xmm2, xmm9

  vpor xmm3, xmm14, xmm3

  vpand xmm7, xmm7, xmm10

  vpor xmm7, xmm11, xmm7

  vpaddd xmm15, xmm7, xmm15

  vpand xmm0, xmm0, xmm10

  vpor xmm0, xmm12, xmm0

  vpaddd xmm4, xmm0, xmm4

  vpand xmm0, xmm1, xmm10

  vpor xmm0, xmm13, xmm0

  vpaddd xmm5, xmm0, xmm5

  vpand xmm0, xmm2, xmm10

  vpor xmm0, xmm3, xmm0

  vpaddd xmm6, xmm0, xmm6

  add rax, 16

  cmp rdi, rax

  jne .LBB0_5

Meanwhile, the same inner loop at x86-64 generic:

.LBB0_5: # =>This Inner Loop Header: Depth=1

  movdqu xmm2, xmmword ptr [rsi + 4*rax]

  paddd xmm0, xmm2

  movdqu xmm2, xmmword ptr [rsi + 4*rax + 16]

  paddd xmm1, xmm2

  add rax, 8

  cmp rdi, rax

  jne .LBB0_5

<a href="https://gcc.godbolt.org/z/Mop93_">https://gcc.godbolt.org/z/Mop93_</a></pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>