<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - Suboptimal x86 vector extend patterns"
   href="https://bugs.llvm.org/show_bug.cgi?id=47283">47283</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Suboptimal x86 vector extend patterns
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: X86
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>david.bolvansky@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, spatel+llvm@rotateright.com
          </td>
        </tr></table>
      <p>
        <div>
        <pre>typedef unsigned char v32qi __attribute__((vector_size (32)));
typedef unsigned long long v4di __attribute__((vector_size (32)));

void
bar_u8_u64 (v4di * dst, v32qi src)
{
  unsigned long long tem[4];

  for (int i = 0; i &lt; 4; i++)
    tem[i] = src[i];

  dst[0] = *(v4di *) tem;
}


Clang -O3:
bar_u8_u64(unsigned long long __vector(4)*, unsigned char __vector(32)):        # @bar_u8_u64(unsigned long long __vector(4)*, unsigned char __vector(32))
        push    rbp
        mov     rbp, rsp
        and     rsp, -32
        sub     rsp, 64
        movdqa  xmm0, xmmword ptr [rbp + 16]
        pxor    xmm1, xmm1
        punpcklbw       xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
        punpcklwd       xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
        movdqa  xmm2, xmm0
        punpckldq       xmm2, xmm1              # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
        movdqa  xmmword ptr [rsp + 16], xmm2
        punpckhdq       xmm0, xmm1              # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
        movdqa  xmmword ptr [rsp + 32], xmm0
        movdqa  xmmword ptr [rdi + 16], xmm0
        movdqa  xmmword ptr [rdi], xmm2
        mov     rsp, rbp
        pop     rbp
        ret

Dispatch Width:    6
uOps Per Cycle:    1.59
IPC:               1.06
Block RThroughput: 5.0

ICC:
bar_u8_u64(unsigned long long __vector(4)*, unsigned char __vector(32)):
        movd      xmm2, DWORD PTR [8+rsp]                       #10.18
        pxor      xmm0, xmm0                                    #10.18
        movdqa    xmm1, xmm2                                    #10.18
        psrldq    xmm2, 2                                       #10.18
        punpcklbw xmm1, xmm0                                    #10.18
        punpcklbw xmm2, xmm0                                    #10.18
        punpcklwd xmm1, xmm0                                    #10.18
        punpcklwd xmm2, xmm0                                    #10.18
        punpckldq xmm1, xmm0                                    #10.18
        punpckldq xmm2, xmm0                                    #10.18
        movdqu    XMMWORD PTR [rdi], xmm1                       #12.3
        movdqu    XMMWORD PTR [16+rdi], xmm2                    #12.3
        ret

Dispatch Width:    6
uOps Per Cycle:    2.40
IPC:               1.83
Block RThroughput: 7.0


<a href="https://godbolt.org/z/1xP3j8">https://godbolt.org/z/1xP3j8</a></pre>
        </div>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>