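<!doctype html>
<html lang="en">

    <head>

      <meta charset="utf-8">

      <title>Bug 50364 - rob pike endianness trick not getting optimized anymore</title>

      <base href="https://bugs.llvm.org/">

    </head>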

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - rob pike endianness trick not getting optimized anymore"

   href="https://bugs.llvm.org/show_bug.cgi?id=50364">50364</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>rob pike endianness trick not getting optimized anymore

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>clang

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>12.0

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>LLVM Codegen

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedclangbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>soap@gentoo.org

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org, neeilans@live.com, richard-llvm@metafoo.co.uk

          </td>

        </tr></table>

      <p>

        <div>

        <pre>I have the following repo with Rob Pike's endianness tricks:

<a href="https://github.com/SoapGentoo/portable-endianness">https://github.com/SoapGentoo/portable-endianness</a>

It seems Clang 12 broke the 64-bit store routines.

Clang 11.0.1:

store64_to_LE(unsigned long, unsigned char*):                   # @store64_to_LE(unsigned long, unsigned char*)

        mov     qword ptr [rsi], rdi

        ret

store64_to_BE(unsigned long, unsigned char*):                   # @store64_to_BE(unsigned long, unsigned char*)

        movbe   qword ptr [rsi], rdi

        ret

Clang 12.0.0:

.LCPI8_0:

        .quad   8                               # 0x8

        .quad   16                              # 0x10

        .quad   24                              # 0x18

        .quad   32                              # 0x20

.LCPI8_1:

        .byte   0                               # 0x0

        .byte   8                               # 0x8

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

store64_to_LE(unsigned long, unsigned char*):                   # @store64_to_LE(unsigned long, unsigned char*)

        mov     byte ptr [rsi], dil

        vmovq   xmm0, rdi

        vpbroadcastq    ymm0, xmm0

        vpsrlvq ymm0, ymm0, ymmword ptr [rip + .LCPI8_0]

        vextracti128    xmm1, ymm0, 1

        vmovdqa xmm2, xmmword ptr [rip + .LCPI8_1] # xmm2 = &lt;0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u&gt;

        vpshufb xmm1, xmm1, xmm2

        vpshufb xmm0, xmm0, xmm2

        vpunpcklwd      xmm0, xmm0, xmm1        # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]

        vmovd   dword ptr [rsi + 1], xmm0

        mov     rax, rdi

        shr     rax, 40

        mov     byte ptr [rsi + 5], al

        mov     rax, rdi

        shr     rax, 48

        mov     byte ptr [rsi + 6], al

        shr     rdi, 56

        mov     byte ptr [rsi + 7], dil

        vzeroupper

        ret

.LCPI11_0:

        .quad   56                              # 0x38

        .quad   48                              # 0x30

        .quad   40                              # 0x28

        .quad   32                              # 0x20

.LCPI11_1:

        .byte   0                               # 0x0

        .byte   8                               # 0x8

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

        .zero   1

store64_to_BE(unsigned long, unsigned char*):                   # @store64_to_BE(unsigned long, unsigned char*)

        mov     rax, rdi

        vmovq   xmm0, rdi

        vpbroadcastq    ymm0, xmm0

        vpsrlvq ymm0, ymm0, ymmword ptr [rip + .LCPI11_0]

        vextracti128    xmm1, ymm0, 1

        vmovdqa xmm2, xmmword ptr [rip + .LCPI11_1] # xmm2 = &lt;0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u&gt;

        vpshufb xmm1, xmm1, xmm2

        vpshufb xmm0, xmm0, xmm2

        vpunpcklwd      xmm0, xmm0, xmm1        # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]

        vmovd   dword ptr [rsi], xmm0

        mov     rcx, rdi

        shr     rcx, 24

        mov     byte ptr [rsi + 4], cl

        mov     rcx, rdi

        shr     rcx, 16

        mov     byte ptr [rsi + 5], cl

        mov     byte ptr [rsi + 6], ah

        mov     byte ptr [rsi + 7], al

        vzeroupper

        ret

I consider detecting this sort of pattern very important in order to write

performant, portable code.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>