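<!doctype html>
<html lang="en">
    <head>
      <meta charset="utf-8">
      <title>Bug 50364: rob pike endianness trick not getting optimized anymore</title>
      <base href="https://bugs.llvm.org/">
    </head>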
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - rob pike endianness trick not getting optimized anymore"
   href="https://bugs.llvm.org/show_bug.cgi?id=50364">50364</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>rob pike endianness trick not getting optimized anymore
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>clang
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>12.0
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>LLVM Codegen
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedclangbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>soap@gentoo.org
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org, neeilans@live.com, richard-llvm@metafoo.co.uk
          </td>
        </tr></table>
        <div>
        <pre>I have the following repo with Rob Pike's endianness tricks:
<a href="https://github.com/SoapGentoo/portable-endianness">https://github.com/SoapGentoo/portable-endianness</a>

It seems Clang 12 broke the 64-bit store routines.

Clang 11.0.1:
store64_to_LE(unsigned long, unsigned char*):                   # @store64_to_LE(unsigned long, unsigned char*)
        mov     qword ptr [rsi], rdi
        ret
store64_to_BE(unsigned long, unsigned char*):                   # @store64_to_BE(unsigned long, unsigned char*)
        movbe   qword ptr [rsi], rdi
        ret

Clang 12.0.0:
.LCPI8_0:
        .quad   8                               # 0x8
        .quad   16                              # 0x10
        .quad   24                              # 0x18
        .quad   32                              # 0x20
.LCPI8_1:
        .byte   0                               # 0x0
        .byte   8                               # 0x8
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
store64_to_LE(unsigned long, unsigned char*):                   # @store64_to_LE(unsigned long, unsigned char*)
        mov     byte ptr [rsi], dil
        vmovq   xmm0, rdi
        vpbroadcastq    ymm0, xmm0
        vpsrlvq ymm0, ymm0, ymmword ptr [rip + .LCPI8_0]
        vextracti128    xmm1, ymm0, 1
        vmovdqa xmm2, xmmword ptr [rip + .LCPI8_1] # xmm2 = &lt;0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u&gt;
        vpshufb xmm1, xmm1, xmm2
        vpshufb xmm0, xmm0, xmm2
        vpunpcklwd      xmm0, xmm0, xmm1        # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
        vmovd   dword ptr [rsi + 1], xmm0
        mov     rax, rdi
        shr     rax, 40
        mov     byte ptr [rsi + 5], al
        mov     rax, rdi
        shr     rax, 48
        mov     byte ptr [rsi + 6], al
        shr     rdi, 56
        mov     byte ptr [rsi + 7], dil
        vzeroupper
        ret
.LCPI11_0:
        .quad   56                              # 0x38
        .quad   48                              # 0x30
        .quad   40                              # 0x28
        .quad   32                              # 0x20
.LCPI11_1:
        .byte   0                               # 0x0
        .byte   8                               # 0x8
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
store64_to_BE(unsigned long, unsigned char*):                   # @store64_to_BE(unsigned long, unsigned char*)
        mov     rax, rdi
        vmovq   xmm0, rdi
        vpbroadcastq    ymm0, xmm0
        vpsrlvq ymm0, ymm0, ymmword ptr [rip + .LCPI11_0]
        vextracti128    xmm1, ymm0, 1
        vmovdqa xmm2, xmmword ptr [rip + .LCPI11_1] # xmm2 = &lt;0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u&gt;
        vpshufb xmm1, xmm1, xmm2
        vpshufb xmm0, xmm0, xmm2
        vpunpcklwd      xmm0, xmm0, xmm1        # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
        vmovd   dword ptr [rsi], xmm0
        mov     rcx, rdi
        shr     rcx, 24
        mov     byte ptr [rsi + 4], cl
        mov     rcx, rdi
        shr     rcx, 16
        mov     byte ptr [rsi + 5], cl
        mov     byte ptr [rsi + 6], ah
        mov     byte ptr [rsi + 7], al
        vzeroupper
        ret

I consider detecting this sort of pattern very important in order to write
performant, portable code.</pre>
        </div>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>