<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - [X86][AVX512] suboptimal shuffle sequence instead of one vpermw instruction"
   href="https://bugs.llvm.org/show_bug.cgi?id=34369">34369</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>[X86][AVX512] suboptimal shuffle sequence instead of one vpermw instruction
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: X86
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>ayman.musa@intel.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr></table>
      <p>
        <div>
        <pre>For the following IR:

define &lt;16 x i16&gt; @test(&lt;16 x i16&gt; %vec) {
   %shuf = shufflevector &lt;16 x i16&gt; %vec, &lt;16 x i16&gt; undef, &lt;16 x i32&gt; &lt;i32 3,
i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32
8, i32 9, i32 10, i32 12, i32 12&gt;
   %res = select &lt;16 x i1&gt; &lt;i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1
1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1&gt;, &lt;16 x i16&gt; %shuf, &lt;16 x i16&gt;
zeroinitializer
   ret &lt;16 x i16&gt; %res
}

<span class="quote">&gt;&gt; llc -mcpu=skx &lt;file-name&gt; -o out.s</span>


LLVM emits (showing 9.52 throughput on IACA tool):
     vextracti128    $1, %ymm0, %xmm1
     vpshufb .LCPI41_0(%rip), %xmm1, %xmm2 # xmm2 =
xmm1[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
     vpshufb .LCPI41_1(%rip), %xmm0, %xmm0 # xmm0 =
xmm0[6,7,0,1,0,1,6,7,10,11,4,5,4,5,6,7]
     vpblendw    $136, %xmm2, %xmm0, %xmm0 # xmm0 =
xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7]
     vpshufb .LCPI41_2(%rip), %xmm1, %xmm1 # xmm1 =
xmm1[14,15,0,1,12,13,0,1,2,3,4,5,8,9,8,9]
     vinserti128 $1, %xmm1, %ymm0, %ymm0
     movw    $-1129, %ax             # imm = 0xFB97
     kmovd   %eax, %k1
     vmovdqu16   %ymm0, %ymm0 {%k1} {z}
     retq  

While it can be replaced with (showing 5.76 throughput on IACA tool):
     vmovdqu .LCPI43_0(%rip), %ymm1 # ymm1 =
[3,0,0,13,5,2,2,10,15,8,14,8,9,10,12,12]
     movw $-1129, %ax
     kmovd %eax, %k1
     vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}

     retq

** Throughput results from IACA tool => lower is better.</pre>
        </div>
      </p>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>