[llvm-bugs] [Bug 33740] New: [LLVM][X86] X86ISelLowering ends with suboptimal instruction for shuffle pattern (VPUNPCKLWD) on AVX2 and above.

via llvm-bugs llvm-bugs at lists.llvm.org
Tue Jul 11 00:48:24 PDT 2017


https://bugs.llvm.org/show_bug.cgi?id=33740

            Bug ID: 33740
           Summary: [LLVM][X86] X86ISelLowering ends with suboptimal
                    instruction for shuffle pattern (VPUNPCKLWD) on AVX2
                    and above.
           Product: new-bugs
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: new bugs
          Assignee: unassignedbugs at nondot.org
          Reporter: michael.zuckerman at intel.com
                CC: llvm-bugs at lists.llvm.org

The following shuffle is lowered to a suboptimal instruction sequence even
though a better lowering is available.
Consider the following LLVM IR:
test.ll:

 define void @interleaved_store(<32 x i8> %x1, <32 x i8> %x2,<32 x i8>* %p) { 
 %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <32 x i32> <i32 0,i32 1,i32
16,i32 17,i32 2,i32 3,i32 18,i32 19,i32 4,i32 5,i32 20,i32 21,i32   6,i32 7,i32
22,i32 23,i32 8,i32 9,i32 24,i32 25,i32 10,i32 11,i32 26,i32 27,i32 12,i32
13,i32 28,i32 29,i32 14,i32 15,i32 30,i32 31> 
 store <32 x i8> %v1, <32 x i8>* %p 
 ret void 
 } 

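With AVX, the IR above is lowered to the optimal unpack instructions
(**vpunpcklwd** / **vpunpckhwd**), as shown in case 1. With AVX2 and above,
the same IR is lowered to a suboptimal sequence (vperm2i128, two vpshufb and
a vpblendvb, each of the last three reading a 32-byte constant-pool entry),
as shown in case 2.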

******************************************case1*******************************************************
bash-4.2$ llc -mtriple=x86_64-pc-linux -mattr=+avx < test.ll 
        .text
        .file   "<stdin>"
        .globl  interleaved_store       # -- Begin function interleaved_store
        .p2align        4, 0x90
        .type   interleaved_store,@function
interleaved_store:                      # @interleaved_store
        .cfi_startproc
# BB#0:
        vextractf128    $1, %ymm0, %xmm1
        vpunpckhwd      %xmm1, %xmm0, %xmm2 # xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
        vpunpcklwd      %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
        vinsertf128     $1, %xmm2, %ymm0, %ymm0
        vmovaps %ymm0, (%rdi)
        vzeroupper
        retq
.Lfunc_end0:
        .size   interleaved_store, .Lfunc_end0-interleaved_store
        .cfi_endproc
                                        # -- End function

        .section        ".note.GNU-stack","",@progbits

*******************************************************************************************************

****************************************case2********************************************************
bash-4.2$ llc -mtriple=x86_64-pc-linux -mattr=+avx2 < test.ll 
        .text
        .file   "<stdin>"
        .section        .rodata.cst32,"aM",@progbits,32
        .p2align        5               # -- Begin function interleaved_store
.LCPI0_0:
        .byte   8                       # 0x8
        .byte   9                       # 0x9
        .byte   0                       # 0x0
        .byte   1                       # 0x1
        .byte   10                      # 0xa
        .byte   11                      # 0xb
        .byte   2                       # 0x2
        .byte   3                       # 0x3
        .byte   12                      # 0xc
        .byte   13                      # 0xd
        .byte   4                       # 0x4
        .byte   5                       # 0x5
        .byte   14                      # 0xe
        .byte   15                      # 0xf
        .byte   6                       # 0x6
        .byte   7                       # 0x7
        .byte   24                      # 0x18
        .byte   25                      # 0x19
        .byte   16                      # 0x10
        .byte   17                      # 0x11
        .byte   26                      # 0x1a
        .byte   27                      # 0x1b
        .byte   18                      # 0x12
        .byte   19                      # 0x13
        .byte   28                      # 0x1c
        .byte   29                      # 0x1d
        .byte   20                      # 0x14
        .byte   21                      # 0x15
        .byte   30                      # 0x1e
        .byte   31                      # 0x1f
        .byte   22                      # 0x16
        .byte   23                      # 0x17
.LCPI0_1:
        .byte   0                       # 0x0
        .byte   1                       # 0x1
        .byte   8                       # 0x8
        .byte   9                       # 0x9
        .byte   2                       # 0x2
        .byte   3                       # 0x3
        .byte   10                      # 0xa
        .byte   11                      # 0xb
        .byte   4                       # 0x4
        .byte   5                       # 0x5
        .byte   12                      # 0xc
        .byte   13                      # 0xd
        .byte   6                       # 0x6
        .byte   7                       # 0x7
        .byte   14                      # 0xe
        .byte   15                      # 0xf
        .byte   16                      # 0x10
        .byte   17                      # 0x11
        .byte   24                      # 0x18
        .byte   25                      # 0x19
        .byte   18                      # 0x12
        .byte   19                      # 0x13
        .byte   26                      # 0x1a
        .byte   27                      # 0x1b
        .byte   20                      # 0x14
        .byte   21                      # 0x15
        .byte   28                      # 0x1c
        .byte   29                      # 0x1d
        .byte   22                      # 0x16
        .byte   23                      # 0x17
        .byte   30                      # 0x1e
        .byte   31                      # 0x1f
.LCPI0_2:
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .byte   0                       # 0x0
        .byte   0                       # 0x0
        .byte   255                     # 0xff
        .byte   255                     # 0xff
        .text
        .globl  interleaved_store
        .p2align        4, 0x90
        .type   interleaved_store,@function
interleaved_store:                      # @interleaved_store
        .cfi_startproc
# BB#0:
        vperm2i128      $35, %ymm0, %ymm0, %ymm1 # ymm1 = ymm0[2,3,0,1]
        vpshufb .LCPI0_0(%rip), %ymm1, %ymm1 # ymm1 = ymm1[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23]
        vpshufb .LCPI0_1(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
        vmovdqa .LCPI0_2(%rip), %ymm2   # ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
        vpblendvb       %ymm2, %ymm0, %ymm1, %ymm0
        vmovdqa %ymm0, (%rdi)
        vzeroupper
        retq

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170711/f8b59e50/attachment-0001.html>


More information about the llvm-bugs mailing list