[LLVMbugs] [Bug 21281] New: Performance regression in vector interleave

bugzilla-daemon at llvm.org
Tue Oct 14 16:17:55 PDT 2014


http://llvm.org/bugs/show_bug.cgi?id=21281

            Bug ID: 21281
           Summary: Performance regression in vector interleave
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: andrew.b.adams at gmail.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

Created attachment 13204
  --> http://llvm.org/bugs/attachment.cgi?id=13204&action=edit
ll to reproduce

The attached .ll interleaves eight 8xi16 vectors into a single 64xi16 vector.
It's used as part of a fast transpose routine. We use this particular
formulation because it generates great code with LLVM <= 3.5 on both x86 and
ARM. ARM is still superb with trunk (12 zip instructions!), but x86 has
regressed in trunk, giving us a >20% slowdown on a transpose benchmark.
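
The attachment is the authoritative reproducer, but as a sketch of the
formulation in question, one pairwise step looks roughly like this (the
function name and lane count are illustrative, not copied from the
attachment):

    ; One interleave step; the attached .ll composes steps like this into
    ; a tree that merges eight 8xi16 vectors into one 64xi16 vector.
    define <16 x i16> @interleave_pair(<8 x i16> %a, <8 x i16> %b) {
      %r = shufflevector <8 x i16> %a, <8 x i16> %b,
           <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10,
                       i32 3, i32 11, i32 4, i32 12, i32 5, i32 13,
                       i32 6, i32 14, i32 7, i32 15>
      ret <16 x i16> %r
    }

Each step like this maps directly onto a punpcklwd/punpckhwd pair on x86,
which is essentially all the 3.5 output below consists of.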

Old asm from LLVM release 3.5 (36 instructions):

    movdqa    %xmm0, %xmm8
    punpcklwd    %xmm1, %xmm8    
    punpckhwd    %xmm1, %xmm0    
    movdqa    %xmm2, %xmm1
    punpcklwd    %xmm3, %xmm1    
    punpckhwd    %xmm3, %xmm2    
    movdqa    %xmm0, %xmm3
    punpckhdq    %xmm2, %xmm3    
    punpckldq    %xmm2, %xmm0    
    movdqa    %xmm8, %xmm2
    punpckhdq    %xmm1, %xmm2    
    punpckldq    %xmm1, %xmm8    
    movdqa    %xmm4, %xmm1
    punpcklwd    %xmm5, %xmm1    
    punpckhwd    %xmm5, %xmm4    
    movdqa    %xmm7, %xmm5
    punpcklwd    %xmm6, %xmm5    
    punpckhwd    %xmm6, %xmm7    
    movdqa    %xmm4, %xmm6
    punpckhdq    %xmm7, %xmm6    
    punpckldq    %xmm7, %xmm4    
    movdqa    %xmm1, %xmm7
    punpckhdq    %xmm5, %xmm7    
    punpckldq    %xmm5, %xmm1    
    movdqa    %xmm8, %xmm5
    punpcklwd    %xmm1, %xmm5    
    punpckhwd    %xmm1, %xmm8    
    movdqa    %xmm2, %xmm1
    punpcklwd    %xmm7, %xmm1    
    punpckhwd    %xmm7, %xmm2    
    movdqa    %xmm0, %xmm7
    punpcklwd    %xmm4, %xmm7    
    punpckhwd    %xmm4, %xmm0    
    movdqa    %xmm3, %xmm4
    punpcklwd    %xmm6, %xmm4    
    punpckhwd    %xmm6, %xmm3

New asm from LLVM trunk (44 instructions):

    pshufd    $78, %xmm1, %xmm9
    pshufd    $78, %xmm0, %xmm8       
    punpcklwd    %xmm9, %xmm8    
    punpcklwd    %xmm1, %xmm0    
    pshufd    $78, %xmm3, %xmm9       
    pshufd    $78, %xmm2, %xmm1       
    punpcklwd    %xmm9, %xmm1    
    punpcklwd    %xmm3, %xmm2    
    movdqa    %xmm0, %xmm9
    punpckhdq    %xmm2, %xmm9    
    punpckldq    %xmm2, %xmm0    
    movdqa    %xmm8, %xmm2
    punpckhdq    %xmm1, %xmm2    
    punpckldq    %xmm1, %xmm8    
    pshufd    $78, %xmm5, %xmm1       
    pshufd    $78, %xmm4, %xmm3       
    punpcklwd    %xmm1, %xmm3    
    punpcklwd    %xmm5, %xmm4    
    pshufd    $78, %xmm6, %xmm1       
    pshufd    $78, %xmm7, %xmm5       
    punpcklwd    %xmm1, %xmm5    
    punpcklwd    %xmm6, %xmm7    
    movdqa    %xmm4, %xmm1
    punpckhdq    %xmm7, %xmm1    
    punpckldq    %xmm7, %xmm4    
    movdqa    %xmm3, %xmm6
    punpckhdq    %xmm5, %xmm6    
    punpckldq    %xmm5, %xmm3    
    pshufd    $78, %xmm3, %xmm5       
    pshufd    $78, %xmm8, %xmm11      
    punpcklwd    %xmm5, %xmm11   
    pshufd    $78, %xmm6, %xmm10      
    pshufd    $78, %xmm2, %xmm12      
    punpcklwd    %xmm10, %xmm12  
    pshufd    $78, %xmm4, %xmm10      
    pshufd    $78, %xmm0, %xmm7       
    punpcklwd    %xmm10, %xmm7   
    pshufd    $78, %xmm1, %xmm10      
    pshufd    $78, %xmm9, %xmm5       
    punpcklwd    %xmm10, %xmm5   
    punpcklwd    %xmm3, %xmm8    
    punpcklwd    %xmm6, %xmm2    
    punpcklwd    %xmm4, %xmm0    
    punpcklwd    %xmm1, %xmm9

The difference is even more stark with AVX enabled:

LLVM 3.5 (28 instructions):

    vpunpcklwd    %xmm1, %xmm0, %xmm8
    vpunpckhwd    %xmm1, %xmm0, %xmm0 
    vpunpcklwd    %xmm3, %xmm2, %xmm1 
    vpunpckhwd    %xmm3, %xmm2, %xmm2 
    vpunpckhdq    %xmm2, %xmm0, %xmm9 
    vpunpckldq    %xmm2, %xmm0, %xmm2 
    vpunpckhdq    %xmm1, %xmm8, %xmm3 
    vpunpckldq    %xmm1, %xmm8, %xmm0 
    vpunpcklwd    %xmm5, %xmm4, %xmm1 
    vpunpckhwd    %xmm5, %xmm4, %xmm4 
    vpunpcklwd    %xmm6, %xmm7, %xmm5 
    vpunpckhwd    %xmm6, %xmm7, %xmm6 
    vpunpckhdq    %xmm6, %xmm4, %xmm7 
    vpunpckldq    %xmm6, %xmm4, %xmm4 
    vpunpckhdq    %xmm5, %xmm1, %xmm6 
    vpunpckldq    %xmm5, %xmm1, %xmm1 
    vpunpckhwd    %xmm1, %xmm0, %xmm5 
    vpunpcklwd    %xmm1, %xmm0, %xmm0 
    vinsertf128    $1, %xmm5, %ymm0, %ymm0
    vpunpckhwd    %xmm6, %xmm3, %xmm1 
    vpunpcklwd    %xmm6, %xmm3, %xmm3 
    vinsertf128    $1, %xmm1, %ymm3, %ymm1
    vpunpckhwd    %xmm4, %xmm2, %xmm3 
    vpunpcklwd    %xmm4, %xmm2, %xmm2 
    vinsertf128    $1, %xmm3, %ymm2, %ymm2
    vpunpckhwd    %xmm7, %xmm9, %xmm3 
    vpunpcklwd    %xmm7, %xmm9, %xmm4 
    vinsertf128    $1, %xmm3, %ymm4, %ymm3

LLVM trunk (84 instructions):

    vpunpcklwd    %xmm1, %xmm1, %xmm8
    vpmovzxwd    %xmm0, %xmm9
    vpblendw    $-86, %xmm8, %xmm9, %xmm8 
    vpunpckhwd    %xmm1, %xmm1, %xmm1 
    vpunpckhwd    %xmm0, %xmm0, %xmm0 
    vpblendw    $-86, %xmm1, %xmm0, %xmm0 
    vpunpcklwd    %xmm3, %xmm3, %xmm9 
    vpmovzxwd    %xmm2, %xmm1
    vpblendw    $-86, %xmm9, %xmm1, %xmm1 
    vpunpckhwd    %xmm3, %xmm3, %xmm3 
    vpunpckhwd    %xmm2, %xmm2, %xmm2 
    vpblendw    $-86, %xmm3, %xmm2, %xmm2 
    vpermilps    $96, %xmm2, %xmm3 
    vpermilps    $-24, %xmm2, %xmm2 
    vinsertf128    $1, %xmm2, %ymm3, %ymm2
    vpermilps    $-44, %xmm0, %xmm3 
    vpermilps    $-10, %xmm0, %xmm0 
    vinsertf128    $1, %xmm0, %ymm3, %ymm0
    vblendps    $-86, %ymm2, %ymm0, %ymm3 
    vpermilps    $96, %xmm1, %xmm0 
    vpermilps    $-24, %xmm1, %xmm1 
    vinsertf128    $1, %xmm1, %ymm0, %ymm0
    vpermilps    $-44, %xmm8, %xmm1 
    vpermilps    $-10, %xmm8, %xmm2 
    vinsertf128    $1, %xmm2, %ymm1, %ymm1
    vblendps    $-86, %ymm0, %ymm1, %ymm1 
    vpunpcklwd    %xmm5, %xmm5, %xmm0 
    vpmovzxwd    %xmm4, %xmm2
    vpblendw    $-86, %xmm0, %xmm2, %xmm0 
    vpunpckhwd    %xmm5, %xmm5, %xmm2 
    vpunpckhwd    %xmm4, %xmm4, %xmm4 
    vpblendw    $-86, %xmm2, %xmm4, %xmm2 
    vpunpcklwd    %xmm6, %xmm6, %xmm4 
    vpmovzxwd    %xmm7, %xmm5
    vpblendw    $-86, %xmm4, %xmm5, %xmm5 
    vpunpckhwd    %xmm6, %xmm6, %xmm4 
    vpunpckhwd    %xmm7, %xmm7, %xmm6 
    vpblendw    $-86, %xmm4, %xmm6, %xmm4 
    vpermilps    $96, %xmm4, %xmm6 
    vpermilps    $-24, %xmm4, %xmm4 
    vinsertf128    $1, %xmm4, %ymm6, %ymm4
    vpermilps    $-44, %xmm2, %xmm6 
    vpermilps    $-10, %xmm2, %xmm2 
    vinsertf128    $1, %xmm2, %ymm6, %ymm2
    vblendps    $-86, %ymm4, %ymm2, %ymm4 
    vpermilps    $96, %xmm5, %xmm2 
    vpermilps    $-24, %xmm5, %xmm5 
    vinsertf128    $1, %xmm5, %ymm2, %ymm2
    vpermilps    $-44, %xmm0, %xmm5 
    vpermilps    $-10, %xmm0, %xmm0 
    vinsertf128    $1, %xmm0, %ymm5, %ymm0
    vblendps    $-86, %ymm2, %ymm0, %ymm2 
    vpunpckhwd    %xmm2, %xmm2, %xmm0 
    vpunpckhwd    %xmm1, %xmm1, %xmm5 
    vpblendw    $-86, %xmm0, %xmm5, %xmm0 
    vpunpcklwd    %xmm2, %xmm2, %xmm5 
    vpmovzxwd    %xmm1, %xmm6
    vpblendw    $-86, %xmm5, %xmm6, %xmm5 
    vinsertf128    $1, %xmm0, %ymm5, %ymm0
    vextractf128    $1, %ymm2, %xmm2
    vpunpckhwd    %xmm2, %xmm2, %xmm5 
    vextractf128    $1, %ymm1, %xmm1
    vpunpckhwd    %xmm1, %xmm1, %xmm6 
    vpblendw    $-86, %xmm5, %xmm6, %xmm5 
    vpunpcklwd    %xmm2, %xmm2, %xmm2 
    vpmovzxwd    %xmm1, %xmm1
    vpblendw    $-86, %xmm2, %xmm1, %xmm1 
    vinsertf128    $1, %xmm5, %ymm1, %ymm1
    vpunpckhwd    %xmm4, %xmm4, %xmm2 
    vpunpckhwd    %xmm3, %xmm3, %xmm5 
    vpblendw    $-86, %xmm2, %xmm5, %xmm2 
    vpunpcklwd    %xmm4, %xmm4, %xmm5 
    vpmovzxwd    %xmm3, %xmm6
    vpblendw    $-86, %xmm5, %xmm6, %xmm5 
    vinsertf128    $1, %xmm2, %ymm5, %ymm2
    vextractf128    $1, %ymm4, %xmm4
    vpunpckhwd    %xmm4, %xmm4, %xmm5 
    vextractf128    $1, %ymm3, %xmm3
    vpunpckhwd    %xmm3, %xmm3, %xmm6 
    vpblendw    $-86, %xmm5, %xmm6, %xmm5 
    vpunpcklwd    %xmm4, %xmm4, %xmm4 
    vpmovzxwd    %xmm3, %xmm3
    vpblendw    $-86, %xmm4, %xmm3, %xmm3 
    vinsertf128    $1, %xmm5, %ymm3, %ymm3
