<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - [LLVM][X86] X86ISelLowering ends with suboptimal instruction for shuffle pattern (VPUNPCKLWD) on AVX2 and above."
   href="https://bugs.llvm.org/show_bug.cgi?id=33740">33740</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>[LLVM][X86] X86ISelLowering ends with suboptimal instruction for shuffle pattern (VPUNPCKLWD) on AVX2 and above.
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>new-bugs
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>new bugs
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>michael.zuckerman@intel.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr></table>
      <p>
        <div>
        <pre>The following shuffle is lowered to a suboptimal instruction sequence even
though a better one is available.
Consider the following .ll sequence.
 1 test.ll                                                                     
                                                                X 

 define void @interleaved_store(&lt;32 x i8&gt; %x1, &lt;32 x i8&gt; %x2,&lt;32 x i8&gt;* %p) { 
 %v1 = shufflevector &lt;32 x i8&gt; %x1, &lt;32 x i8&gt; %x2, &lt;32 x i32&gt; &lt;i32 0,i32 1,i32
16,i32 17,i32 2,i32 3,i32 18,i32 19,i32 4,i32 5,i32 20,i32 21,i32   6,i32 7,i32
22,i32 23,i32 8,i32 9,i32 24,i32 25,i32 10,i32 11,i32 26,i32 27,i32 12,i32
13,i32 28,i32 29,i32 14,i32 15,i32 30,i32 31&gt; 
 store &lt;32 x i8&gt; %v1, &lt;32 x i8&gt;* %p 
 ret void 
 } 

With AVX, the .ll file above is lowered to the optimal **vpunpckhwd** /
**vpunpcklwd** instructions (case1), while with AVX2 and above the same .ll file
is lowered to a suboptimal sequence (as shown in case2).

******************************************case1*******************************************************
bash-4.2$ llc -mtriple=x86_64-pc-linux -mattr=+avx < test.ll 
        .text
        .file   "&lt;stdin&gt;"
        .globl  interleaved_store       # -- Begin function interleaved_store
        .p2align        4, 0x90
        .type   interleaved_store,@function
interleaved_store:                      # @interleaved_store
        .cfi_startproc
# BB#0:
        vextractf128    $1, %ymm0, %xmm1
        vpunpckhwd      %xmm1, %xmm0, %xmm2 # xmm2 =
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
        vpunpcklwd      %xmm1, %xmm0, %xmm0 # xmm0 =
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
        vinsertf128     $1, %xmm2, %ymm0, %ymm0
        vmovaps %ymm0, (%rdi)
        vzeroupper
        retq
.Lfunc_end0:
        .size   interleaved_store, .Lfunc_end0-interleaved_store
        .cfi_endproc
                                        # -- End function

        .section        ".note.GNU-stack","",@progbits

*******************************************************************************************************

****************************************case2********************************************************
bash-4.2$ llc -mtriple=x86_64-pc-linux -mattr=+avx2 < test.ll 
        .text
        .file   "&lt;stdin&gt;"
        .section        .rodata.cst32,"aM",@progbits,32
        .p2align        5               # -- Begin function interleaved_store
.LCPI0_0:
        .byte   8                       # 0x8
        .byte   9                       # 0x9
        .byte   0                       # 0x0
        .byte   1                       # 0x1
        .byte   10                      # 0xa
        .byte   11                      # 0xb
        .byte   2                       # 0x2
        .byte   3                       # 0x3
        .byte   12                      # 0xc
        .byte   13                      # 0xd
        .byte   4                       # 0x4
        .byte   5                       # 0x5
        .byte   14                      # 0xe
        .byte   15                      # 0xf
        .byte   6                       # 0x6
        .byte   7                       # 0x7
        .byte   24                      # 0x18
        .byte   25                      # 0x19
        .byte   16                      # 0x10
        .byte   17                      # 0x11
        .byte   26                      # 0x1a
        .byte   27                      # 0x1b
        .byte   18                      # 0x12
        .byte   19                      # 0x13
        .byte   28                      # 0x1c
        .byte   29                      # 0x1d
        .byte   20                      # 0x14
        .byte   21                      # 0x15
        .byte   30                      # 0x1e
        .byte   31                      # 0x1f
        .byte   22                      # 0x16
        .byte   23                      # 0x17
.LCPI0_1:
        .byte   0                       # 0x0
        .byte   1                       # 0x1
        .byte   8                       # 0x8
        .byte   9                       # 0x9
        .byte   2                       # 0x2
        .byte   3                       # 0x3
        .byte   10                      # 0xa
        .byte   11                      # 0xb
        .byte   4                       # 0x4
        .byte   5                       # 0x5
        .byte   12                      # 0xc
        .byte   13                      # 0xd
        .byte   6                       # 0x6
        .byte   7                       # 0x7
        .byte   14                      # 0xe
        .byte   15                      # 0xf
        .byte   16                      # 0x10
        .byte   17                      # 0x11
        .byte   24                      # 0x18
        .byte   25                      # 0x19
        .byte   18                      # 0x12
        .byte   19                      # 0x13
        .byte   26                      # 0x1a
        .byte   27                      # 0x1b
        .byte   20                      # 0x14
        .byte   21                      # 0x15
        .byte   28                      # 0x1c
        .byte   29                      # 0x1d
        .byte   22                      # 0x16
        .byte   23                      # 0x17
        .byte   30                      # 0x1e
        .byte   31                      # 0x1f
.LCPI0_2:
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .text
        .globl  interleaved_store
        .p2align        4, 0x90
        .type   interleaved_store,@function
interleaved_store:                      # @interleaved_store
        .cfi_startproc
# BB#0:
        vperm2i128      $35, %ymm0, %ymm0, %ymm1 # ymm1 = ymm0[2,3,0,1]
        vpshufb .LCPI0_0(%rip), %ymm1, %ymm1 # ymm1 =
ymm1[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23]
        vpshufb .LCPI0_1(%rip), %ymm0, %ymm0 # ymm0 =
ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
        vmovdqa .LCPI0_2(%rip), %ymm2   # ymm2 =
[255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
        vpblendvb       %ymm2, %ymm0, %ymm1, %ymm0
        vmovdqa %ymm0, (%rdi)
        vzeroupper
        retq</pre>
        </div>
      </p>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>